xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 30602db00db74b7e41a0c75e517aefe6711423f0)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 #include <thrust/async/for_each.h>
17 
18 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
19 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
20   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
21     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
22 
23   typedef enum {
24       CUSPARSE_MV_ALG_DEFAULT = 0,
25       CUSPARSE_COOMV_ALG      = 1,
26       CUSPARSE_CSRMV_ALG1     = 2,
27       CUSPARSE_CSRMV_ALG2     = 3
28   } cusparseSpMVAlg_t;
29 
30   typedef enum {
31       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
32       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
33       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
34       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
35       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
36       CUSPARSE_SPMM_ALG_DEFAULT = 0,
37       CUSPARSE_SPMM_COO_ALG1    = 1,
38       CUSPARSE_SPMM_COO_ALG2    = 2,
39       CUSPARSE_SPMM_COO_ALG3    = 3,
40       CUSPARSE_SPMM_COO_ALG4    = 5,
41       CUSPARSE_SPMM_CSR_ALG1    = 4,
42       CUSPARSE_SPMM_CSR_ALG2    = 6,
43   } cusparseSpMMAlg_t;
44 
45   typedef enum {
46       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
47       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
48   } cusparseCsr2CscAlg_t;
49   */
50   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
51   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
52   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
53 #endif
54 
55 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
57 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
58 
59 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
61 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
64 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
66 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
67 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
68 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
69 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
70 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
71 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
77 
78 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
80 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
81 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
82 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
83 
84 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
86 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
87 
88 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
89 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 
/*
   MatCUSPARSESetStream - Attaches a CUDA stream to the matrix's cuSPARSE handle so that
   subsequent cuSPARSE operations on A are queued on that stream.

   Errors with PETSC_ERR_COR if the matrix has no GPU-side data structure (spptr).
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  /* remember the stream on the matrix and point the library handle at it */
  cusp->stream = stream;
  status = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
104 
/*
   MatCUSPARSESetHandle - Replaces the cuSPARSE library handle stored on the matrix.

   If a different handle is currently stored it is destroyed before the new one is
   adopted; the pointer mode on the active handle is then (re)set to device pointers.
   Errors with PETSC_ERR_COR if the matrix has no GPU-side data structure (spptr).
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we currently hold before taking the new one */
    if (cusp->handle) {status = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(status);}
    cusp->handle = handle;
  }
  status = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
121 
/*
   MatCUSPARSEClearHandle - Drops the matrix's reference to its cuSPARSE handle
   without destroying the handle itself. A no-op for matrices that are not of
   type MATSEQAIJCUSPARSE or that have no GPU-side data structure.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse || !cusp) PetscFunctionReturn(0);
  /* only forget the reference; the handle is not destroyed here */
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
134 
/* MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver package
   associated with this factored matrix; the matrix argument A itself is not used. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
141 
142 /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
149 
150   Level: beginner
151 
152 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
153 M*/
154 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the (square, n x n) factor matrix B used by
   the MATSOLVERCUSPARSE package to hold LU/ILU/ILUDT or Cholesky/ICC factorizations of A.

   Wires the symbolic-factorization function pointers for the requested factor type, records
   the preferred orderings per factor type, and composes the solver-type query function on B.
   Unsupported factor types raise PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;  /* factors are square: n rows and n columns */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    /* symbolic phases run the CPU SeqAIJ implementations wrapped by these functions */
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* preferred orderings: nested dissection for full LU, natural for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* factor matrices are filled in by the numeric phase; skip value allocation here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
185 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation of MatCUSPARSESetFormat().
   For sequential matrices MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single stored
   format; any other operation raises PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both supported ops store the same format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
203 
204 /*@
205    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
206    operation. Only the MatMult operation can use different GPU storage formats
207    for MPIAIJCUSPARSE matrices.
208    Not Collective
209 
210    Input Parameters:
211 +  A - Matrix of type SEQAIJCUSPARSE
212 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
213 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
214 
   Level: intermediate
218 
219 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
220 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; no-op otherwise */
  err = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(err);
  PetscFunctionReturn(0);
}
230 
/*
   MatSetOption_SeqAIJCUSPARSE - Handles MAT_FORM_EXPLICIT_TRANSPOSE locally (invalidating
   any cached GPU transpose when the option is switched off) and defers every other option
   to the CPU SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
248 
249 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
250 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization: pulls A's values back
   to the host, runs the CPU SeqAIJ numeric factorization, selects the (natural-ordering
   or permuted) GPU triangular-solve routines, and uploads the factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;   /* factorization just ran on the host */
  /* determine which version of MatSolve needs to be used: identity permutations allow
     the cheaper natural-ordering kernels */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no mat-solve variants are provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
281 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes command-line options for SEQAIJCUSPARSE
   matrices: storage format for SpMV (and TriSolve), and — for CUDA >= 11 — the cuSPARSE
   SpMV, SpMM, and CSR-to-CSC algorithm choices. Options are only parsed for unfactored
   matrices (factortype == MAT_FACTOR_NONE).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check as above, against the SpMM enum */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    /* entry 0 of MatCUSPARSECsr2CscAlgorithms[] is a placeholder; cuSPARSE's enum starts at 1 */
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
317 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU: clears any stale GPU triangular
   factors, runs the CPU SeqAIJ symbolic phase, and installs the CUSPARSE numeric routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
329 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU: clears any stale GPU triangular
   factors, runs the CPU SeqAIJ symbolic phase, and installs the CUSPARSE numeric routine.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
341 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC: clears any stale GPU triangular
   factors, runs the CPU SeqAIJ symbolic phase, and installs the CUSPARSE numeric routine.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
353 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky: clears any stale GPU
   triangular factors, runs the CPU SeqAIJ symbolic phase, and installs the CUSPARSE
   numeric routine.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
365 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (or refreshes) the GPU copy of the unit
   lower-triangular factor L from the host-side SeqAIJ factor data of A.

   On first call it packs L in CSR form in pinned host buffers — prepending an explicit
   1.0 diagonal entry to each row, since the factor stores a unit diagonal implicitly —
   uploads it to the GPU via thrust, creates the cuSPARSE matrix descriptor, and runs the
   triangular-solve analysis. On later calls (loTriFactor already present) only the values
   are refilled and re-uploaded; the sparsity pattern is reused.
   Only acts when the matrix data is current on the CPU (offloadmask UNALLOCATED or CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;                /* pinned CSR row offsets / column indices being built */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host memory so the thrust uploads below are fast async-capable copies */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 holds only its unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append its unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: thrust assign() performs the host-to-device copy */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE requires an explicitly-sized work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AALo is kept (as AA_h) for value-only refreshes, the index
           buffers are no longer needed on the host */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix (same layout as above, values only) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
510 
511 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
512 {
513   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
514   PetscInt                          n = A->rmap->n;
515   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
516   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
517   cusparseStatus_t                  stat;
518   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
519   const MatScalar                   *aa = a->a,*v;
520   PetscInt                          *AiUp, *AjUp;
521   PetscInt                          i,nz, nzUpper, offset;
522   PetscErrorCode                    ierr;
523   cudaError_t                       cerr;
524 
525   PetscFunctionBegin;
526   if (!n) PetscFunctionReturn(0);
527   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
528     try {
529       /* next, figure out the number of nonzeros in the upper triangular matrix. */
530       nzUpper = adiag[0]-adiag[n];
531       if (!upTriFactor) {
532         PetscScalar *AAUp;
533 
534         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
535 
536         /* Allocate Space for the upper triangular matrix */
537         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
538         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
539 
540         /* Fill the upper triangular matrix */
541         AiUp[0]=(PetscInt) 0;
542         AiUp[n]=nzUpper;
543         offset = nzUpper;
544         for (i=n-1; i>=0; i--) {
545           v  = aa + adiag[i+1] + 1;
546           vi = aj + adiag[i+1] + 1;
547 
548           /* number of elements NOT on the diagonal */
549           nz = adiag[i] - adiag[i+1]-1;
550 
551           /* decrement the offset */
552           offset -= (nz+1);
553 
554           /* first, set the diagonal elements */
555           AjUp[offset] = (PetscInt) i;
556           AAUp[offset] = (MatScalar)1./v[nz];
557           AiUp[i]      = AiUp[i+1] - (nz+1);
558 
559           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
560           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
561         }
562 
563         /* allocate space for the triangular factor information */
564         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
565         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
566 
567         /* Create the matrix description */
568         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
569         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
570        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
571         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
572        #else
573         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
574        #endif
575         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
576         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
577 
578         /* set the operation */
579         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
580 
581         /* set the matrix */
582         upTriFactor->csrMat = new CsrMatrix;
583         upTriFactor->csrMat->num_rows = n;
584         upTriFactor->csrMat->num_cols = n;
585         upTriFactor->csrMat->num_entries = nzUpper;
586 
587         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
589 
590         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
592 
593         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
595 
596         /* Create the solve analysis information */
597         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
598         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
599       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
604                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
605         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
606       #endif
607 
608         /* perform the solve analysis */
609         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
613                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
615                                #endif
616 );CHKERRCUSPARSE(stat);
617         cerr = WaitForCUDA();CHKERRCUDA(cerr);
618         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
619 
620         /* assign the pointer */
621         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
622         upTriFactor->AA_h = AAUp;
623         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
624         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
625         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
626       } else {
627         if (!upTriFactor->AA_h) {
628           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
629         }
630         /* Fill the upper triangular matrix */
631         offset = nzUpper;
632         for (i=n-1; i>=0; i--) {
633           v  = aa + adiag[i+1] + 1;
634 
635           /* number of elements NOT on the diagonal */
636           nz = adiag[i] - adiag[i+1]-1;
637 
638           /* decrement the offset */
639           offset -= (nz+1);
640 
641           /* first, set the diagonal elements */
642           upTriFactor->AA_h[offset] = 1./v[nz];
643           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
644         }
645         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
646         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
647       }
648     } catch(char *ex) {
649       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
650     }
651   }
652   PetscFunctionReturn(0);
653 }
654 
/*
   Builds the ILU lower/upper triangular factor matrices on the GPU and caches
   the row/column permutation indices (when not the identity) that the
   permuted MatSolve paths apply on the device.
 */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS = aij->row,colIS = aij->icol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  if (!fs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* build both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: cache the row permutation unless it is the identity */
  ierr = ISIdentity(rowIS,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !fs->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(rowIS,&r);CHKERRQ(ierr);
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(r,r+m);
    ierr = ISRestoreIndices(rowIS,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices: cache the column permutation unless it is the identity */
  ierr = ISIdentity(colIS,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !fs->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(colIS,&c);CHKERRQ(ierr);
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(c,c+m);
    ierr = ISRestoreIndices(colIS,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
698 
/*
   Builds the triangular factor data of an ICC/Cholesky factorization on the
   GPU.  Only the upper factor U is stored explicitly (CSR); the lower solve
   is realized by solving with U transposed (see loTriFactor->solveOp below).
   On the first call (no factor structs yet) the CSR structure is built and
   the cusparse solve analyses are performed; on subsequent calls (same
   sparsity pattern) only the numerical values are recomputed and re-uploaded.
 */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (for nz) and as
     Mat_SeqSBAIJ (for i, j, a below); presumably the factored matrix keeps
     its data in SBAIJ (upper-triangle) layout -- confirm against the
     symbolic ICC factorization that created A. */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the values of the U and L factors */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz]; /* the diagonal is stored inverted */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* copy the strictly-upper entries; negate them for U and
               additionally scale by the inverse diagonal for L */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* DIAG_TYPE_UNIT: cusparse treats the diagonal as 1 in this solve, so
           the inverted diagonal stored in AAUp is not referenced by it */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* fill mode stays UPPER: the stored data is U, and the lower solve is
           performed with a transpose operation (L = U^T), see solveOp below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else { /* factors already exist: refresh only the numerical values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
914 
/*
   Copies the ICC/Cholesky triangular factor data of A to the GPU and, when
   the ordering is not the identity, caches the row permutation and its
   inverse for use by the permuted MatSolve paths.
 */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm = aij->row;
  PetscBool                    identity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  if (!fs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
  /* off-diagonal entries are counted for both triangles, the diagonal once */
  fs->nnz = (aij->nz-m)*2 + m;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(rowperm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invperm;
    const PetscInt *inv,*perm;

    ierr = ISInvertPermutation(rowperm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&inv);CHKERRQ(ierr);
    ierr = ISGetIndices(rowperm,&perm);CHKERRQ(ierr);
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(perm,perm+m);
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(inv,inv+m);
    ierr = ISRestoreIndices(invperm,&inv);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(rowperm,&perm);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
952 
/*
   Numeric Cholesky factorization for SEQAIJCUSPARSE: the factorization itself
   runs on the host (via MatCholeskyFactorNumeric_SeqAIJ); afterwards the
   triangular factors are copied to the GPU and the solve function pointers
   are selected based on whether the ordering is natural.
 */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  IS             rowperm = bseq->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the host factorization needs current values */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: natural ordering
     permits the permutation-free kernels */
  ierr = ISIdentity(rowperm,&natural);CHKERRQ(ierr);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
982 
/*
   Builds the transposes (CSC form) of the lower and upper triangular factors
   on the GPU, so that transpose solves can be carried out as non-transpose
   cusparse solves, and runs the corresponding solve analyses.

   Bug fix: the MAT_CUSPARSEGenerateTranspose log event was begun twice and
   never ended for each csr2csc conversion (PetscLogEventBegin where
   PetscLogEventEnd was intended), which corrupts log-event nesting.  Both
   closing calls are now PetscLogEventEnd.
 */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the fill mode
     flips because the transpose of an upper (lower) triangle is lower (upper) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11+ requires an explicit workspace buffer for csr2csc */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                       upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                       upTriFactor->csrMat->values->data().get(),
                                       upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(),
                                       upTriFactorT->csrMat->values->data().get(),
                                       upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1184 
/* Unary functor for thrust::transform: truncates the real part of a PetscScalar
   to a PetscInt.  Used below (in MatSeqAIJCUSPARSEFormExplicitTransposeForMult)
   to recover the integer csr2csc permutation indices that were temporarily
   carried through cusparse_csr2csc inside a scalar-valued array. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    const PetscReal realPart = PetscRealPart(s);
    return static_cast<PetscInt>(realPart);
  }
};
1193 
/* Build (or refresh) the explicit transpose of A on the GPU, cached in
   cusparsestruct->matTranspose, so transpose multiplies can use a plain SpMV
   with the transposed matrix.

   CSR path: allocate the transposed CSR arrays once, then convert with
   cusparse csr2csc.  The first conversion also records the CSR->CSC value
   permutation in cusparsestruct->csr2csc_i; later refreshes only gather the
   values through that permutation (no new csr2csc call).
   ELL/HYB path (pre CUDA-11 only): convert HYB -> CSR -> CSC -> HYB with
   temporaries that are freed before returning.

   Collective; returns 0 immediately when explicit transposes are disabled or
   the matrix has zero rows or columns. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Nothing to do when the user disabled explicit transposes or A is empty */
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  /* Make sure the (non-transposed) GPU copy is current before transposing it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  /* Cached transpose already reflects the current values: nothing to rebuild */
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats are rebuilt from scratch; drop any stale transpose */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    /* the transpose inherits the index base (0- or 1-based) of the original */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants used by SpMV calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the transposed CSR: dimensions swap, nnz is unchanged */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* csr2csc needs the (uncompressed) row offsets of A on the device */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA-11 generic API needs an explicit matrix descriptor for SpMV */
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries: the HYB matrix owns its own storage now */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* One-time setup: run csr2csc on the identity sequence 0,1,...,nnz-1
         (stored as scalars) so that the permuted output, truncated back to
         integers, gives the CSR->CSC value permutation csr2csc_i. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA-11 csr2cscEx2 requires a caller-provided workspace */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* truncate the permuted scalar sequence back to integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* Refresh the transposed values by gathering through csr2csc_i; this also
       overwrites the scalar-encoded permutation written by the setup above. */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1402 
1403 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1404 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1405 {
1406   PetscInt                              n = xx->map->n;
1407   const PetscScalar                     *barray;
1408   PetscScalar                           *xarray;
1409   thrust::device_ptr<const PetscScalar> bGPU;
1410   thrust::device_ptr<PetscScalar>       xGPU;
1411   cusparseStatus_t                      stat;
1412   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1413   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1414   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1415   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1416   PetscErrorCode                        ierr;
1417 
1418   PetscFunctionBegin;
1419   /* Analyze the matrix and create the transpose ... on the fly */
1420   if (!loTriFactorT && !upTriFactorT) {
1421     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1422     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1423     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1424   }
1425 
1426   /* Get the GPU pointers */
1427   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1428   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1429   xGPU = thrust::device_pointer_cast(xarray);
1430   bGPU = thrust::device_pointer_cast(barray);
1431 
1432   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1433   /* First, reorder with the row permutation */
1434   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1435                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1436                xGPU);
1437 
1438   /* First, solve U */
1439   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1440                         upTriFactorT->csrMat->num_rows,
1441                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1442                         upTriFactorT->csrMat->num_entries,
1443                       #endif
1444                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1445                         upTriFactorT->csrMat->values->data().get(),
1446                         upTriFactorT->csrMat->row_offsets->data().get(),
1447                         upTriFactorT->csrMat->column_indices->data().get(),
1448                         upTriFactorT->solveInfo,
1449                         xarray, tempGPU->data().get()
1450                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1451                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1452                       #endif
1453 );CHKERRCUSPARSE(stat);
1454 
1455   /* Then, solve L */
1456   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1457                         loTriFactorT->csrMat->num_rows,
1458                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1459                         loTriFactorT->csrMat->num_entries,
1460                       #endif
1461                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1462                         loTriFactorT->csrMat->values->data().get(),
1463                         loTriFactorT->csrMat->row_offsets->data().get(),
1464                         loTriFactorT->csrMat->column_indices->data().get(),
1465                         loTriFactorT->solveInfo,
1466                         tempGPU->data().get(), xarray
1467                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1468                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1469                       #endif
1470 );CHKERRCUSPARSE(stat);
1471 
1472   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1473   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1474                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1475                tempGPU->begin());
1476 
1477   /* Copy the temporary to the full solution. */
1478   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1479 
1480   /* restore */
1481   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1482   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1483   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1484   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1485   PetscFunctionReturn(0);
1486 }
1487 
/* Solve A^T x = b with the cached transposed triangular factors when the
   factorization used the natural ordering, so no row/column permutations are
   needed: b feeds the upper-factor solve directly and the lower-factor solve
   writes straight into x.  The transposed factor structures are built lazily
   on the first call. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the (transposed) upper factor: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the (transposed) lower factor: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* standard 2*nnz - n flop count for a pair of triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1553 
/* Solve A x = b using the cached triangular factors (A = L*U up to the
   row/column permutations of the factorization): permute b, forward-solve
   with L, back-solve with U, then undo the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector.
     (The iteration length is governed by the rpermIndices index iterators;
     permutation_iterator comparison uses only the index iterator.) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* standard 2*nnz - n flop count for a pair of triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1626 
/* Solve A x = b using the cached triangular factors when the factorization
   used the natural ordering: no permutations, so b feeds the lower-factor
   solve directly and the upper-factor solve writes straight into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* standard 2*nnz - n flop count for a pair of triangular solves */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1684 
/* Copy the matrix values (the sparsity pattern is already shared) from the
   device CSR storage back to the host arrays when the GPU holds the only
   up-to-date copy; afterwards both copies are marked valid.  A no-op unless
   the offload mask is PETSC_OFFLOAD_GPU. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix    *csr    = (CsrMatrix*)cusp->mat->mat;
    const size_t nbytes  = aij->nz*sizeof(PetscScalar);

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1705 
/* Hand back a pointer to the host CSR value array.  The host copy is brought
   up to date first; the offload mask is then set to PETSC_OFFLOAD_CPU because
   the caller may modify the values through the returned pointer. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* ensure the host values are current before exposing them */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = aij->a;
  PetscFunctionReturn(0);
}
1717 
/* Copy the host CSR data of A to the GPU, (re)building the cuSPARSE structures
   when needed.  Fast path: unchanged nonzero state and CSR format -> transfer
   the values array only.  Slow path: destroy and recreate the device matrix
   (CSR always; ELL/HYB only before CUDA 11), including the compressed-row
   index array when A uses compressed row storage. */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when host has no values array to copy */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed: a cached transpose keeps its pattern but its values are stale */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern (or storage format) changed: discard all device structures */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed row storage: only rows with nonzeros are represented; ridx maps back to full row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: size from the row offsets and do not mark the host copy valid */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, matching CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          /* 32-bit copies of row offsets and column indices for cuSPARSE */
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage a temporary CSR image on the device, convert it to HYB/ELL, then free the CSR copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL is HYB with the "max" partition (no COO spill region) */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* tmp accounts for the cprowIndices transfer; the 3 scalars are alpha_one/beta_zero/beta_one */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1874 
/* Thrust functor for zip iterators: accumulate the first tuple component
   into the second, i.e. t<1> = t<0> + t<1>. */
struct VecCUDAPlusEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
1884 
/* Thrust functor for zip iterators: copy the first tuple component onto
   the second, i.e. t<1> = t<0>. */
struct VecCUDAEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1894 
/* Thrust functor for zip iterators: copy in the opposite direction of
   VecCUDAEquals, i.e. t<0> = t<1>. */
struct VecCUDAEqualsReverse
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1904 
/* Scratch data attached to C->product for the cuSPARSE sparse-dense and
   sparse-sparse matrix products; freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense; /* user's C was MATSEQDENSE (CPU): convert the result back after computing on the GPU */
  PetscScalar           *Bt;      /* CUDA < 11 only: device buffer for B^T, since cusparseXcsrmm cannot transpose B */
  Mat                   X;        /* intermediate dense result A*P (PtAP) or A*R^T (RARt) */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count logged by the sparse-sparse numeric phase */
  CsrMatrix             *Bcsr;    /* uncompressed CSR copy of B when B uses compressed row storage (see numeric phase) */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* descriptor paired with Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1924 
/* Destructor for the MatMatCusparse scratch data stored in C->product->data:
   releases device buffers, cuSPARSE descriptors, the intermediate matrix X,
   and finally the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so no guard needed */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and buffers are created lazily, so each one is guarded */
  if (mm->matSpBDescr) { stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mm->mmBuffer)    { cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr); }
  if (mm->mmBuffer2)   { cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr); }
  if (mm->matBDescr)   { stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matCDescr)   { stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat); }
  if (mm->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat); }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1949 
1950 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1951 
/* Numeric phase of products of a MATSEQAIJCUSPARSE matrix A with a dense CUDA
   matrix B (AB, AtB, ABt, PtAP, RARt).  The sparse-dense product is done with
   cusparseSpMM (CUDA >= 11, supports op(B)) or cusparseXcsrmm (older CUDA,
   where B^T is first formed explicitly with cublasXgeam).  For PtAP/RARt the
   result goes into the intermediate mmdata->X and is finished with a
   dense-dense product. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the dimensions (m x n) of the sparse-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either transpose implicitly in the SpMM call, or multiply with an explicitly formed A^T */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size increased */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in the mmdata->Bt buffer allocated in the symbolic phase */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense product: C = B^T*X (PtAP) or C = B*X (RARt) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the types the user originally passed in */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2124 
/* Symbolic phase of products of a MATSEQAIJCUSPARSE matrix A with a dense
   CUDA matrix B (AB, AtB, ABt, PtAP, RARt): sets the sizes and type of C and
   allocates the MatMatCusparse data reused by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the result depend on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:   m = A->rmap->n; n = B->cmap->n; break;
  case MATPRODUCT_AtB:  m = A->cmap->n; n = B->cmap->n; break;
  case MATPRODUCT_ABt:  m = A->rmap->n; n = B->rmap->n; break;
  case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; break;
  case MATPRODUCT_RARt: m = B->rmap->n; n = B->rmap->n; break;
  default: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm cannot transpose B: stage a device buffer to hold B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* PtAP and RARt need intermediate dense storage X = A*P (resp. A*R^T);
     do not preallocate: the first MatDenseCUDAGetArray call allocates on the GPU */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2198 
/* Numeric phase of sparse-sparse products C = op(A)*op(B) with both operands
   MATSEQAIJCUSPARSE.  Uses cusparseSpGEMM_compute/copy (CUDA >= 11) or
   cusparseXcsrgemm (older CUDA) on the structures set up by the symbolic
   phase.  When the symbolic phase already computed the values (reusesym),
   or when C has no nonzeros, skips straight to the assembly bookkeeping. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* for symmetric operands the transpose product reduces to the plain one */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute the values with the SpGEMM descriptor/buffer set up in the symbolic phase, then copy into C */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2313 
/*
  Symbolic phase of C = A*B, C = A^t*B or C = A*B^t with A,B of type SEQAIJCUSPARSE.

  Computes the sparsity pattern of C on the GPU -- with the cusparseSpGEMM API on
  CUDA >= 11, or cusparseXcsrgemmNnz plus a full (numeric) csrgemm on older CUDA,
  which cannot do a symbolic-only pass -- and then copies the pattern back to the
  host so that C ends up as a fully preallocated (but unassembled) SeqAIJ matrix.
  Scratch buffers and cuSPARSE descriptors needed by the numeric phase are kept in
  C->product->data (a MatMatCusparse, destroyed by MatDestroy_MatMatCusparse).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;  /* cusparseSpMatGetSize() reports sizes as int64_t */
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be SEQAIJCUSPARSE (exact type, not a subtype) */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* a symmetric operand lets us use the cheaper AB path instead of AtB/ABt */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* Select operand mult structs (transposes are formed explicitly where needed)
     and record whether each operand stores compressed (zero-row-dropped) rows. */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    /* C inherits A's nonzero-row index set; copy it to both host and device */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;  /* device row count may be smaller than m when compressed */
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants (the handle is used in device pointer mode below) */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: build an all-zero pattern directly and skip the GPU SpGEMM */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow alias of B's CSR that substitutes full (uncompressed) row
       offsets; column indices and values are shared, not copied */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;  /* ownership of the alias goes to the product data for later cleanup */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each a(i,brow) multiplies the whole B row 'brow': 2*nnz(B(brow,:)) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A pairs with row i of B: 2*nnz(A(i,:))*nnz(B(i,:)) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cusparseSpGEMM protocol: two workEstimation calls (size query, then fill),
     two compute calls (size query, then compute), then copy -- see the NOTE below
     about why both buffers must be kept alive. */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* pre-CUDA-11 path: Nnz count needs host pointer mode (cnz returned to host) */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU sparsity pattern into host-side SeqAIJ arrays so C behaves
     like a normal preallocated SeqAIJ matrix */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;   /* thrust widens the 32-bit device arrays to PetscInt */
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not present in the compressed set are empty: they repeat the offset
       of the preceding stored row */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* derive per-row lengths and row statistics from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);  /* host values array; numeric phase fills it on demand */
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2637 
2638 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2639 
2640 /* handles sparse or dense B */
/* Select the symbolic product implementation for a product whose A is
   SEQAIJCUSPARSE.  B may be sparse or dense; when an operand is bound to the
   CPU, or B/C are not CUSPARSE matrices, dispatch falls back to the plain
   SeqAIJ (CPU) machinery. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* only probe B for the GPU sparse path when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  /* for triple products, C must also be a (non-CPU-bound) CUSPARSE matrix */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt || t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave productsymbolic unset */
  } else if (Biscusp && Ciscusp) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt || t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2698 
/* yy = A*xx on the GPU; thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE with
   no vector to add (NULL) and no transpose/Hermitian flags */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2707 
/* zz = A*xx + yy on the GPU; forwards to the shared kernel with both the
   transpose and Hermitian flags off */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2716 
/* yy = A^H * xx on the GPU (trans = PETSC_TRUE, herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2725 
/* zz = A^H * xx + yy on the GPU (trans = PETSC_TRUE, herm = PETSC_TRUE) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2734 
/* yy = A^T * xx on the GPU (trans = PETSC_TRUE, herm = PETSC_FALSE) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2743 
/* y[idx[i]] += x[i] for 0 <= i < n: scatter-add the compact work vector x into
   the full-length vector y at the row positions idx.  The caller passes the
   nonzero-row indices of a compressed-row matrix (cprowIndices), which are
   presumably distinct, so no atomics are used -- TODO confirm if ever reused
   with repeated indices.
   A grid-stride loop with a PetscInt induction variable makes the kernel
   correct for any launch configuration and avoids the 32-bit overflow of
   blockIdx.x*blockDim.x + threadIdx.x when n exceeds INT_MAX (reachable with
   64-bit PetscInt builds). */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2749 
2750 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2751 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2752 {
2753   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2754   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2755   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2756   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2757   PetscErrorCode               ierr;
2758   cusparseStatus_t             stat;
2759   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2760   PetscBool                    compressed;
2761 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2762   PetscInt                     nx,ny;
2763 #endif
2764 
2765   PetscFunctionBegin;
2766   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
2767   if (!a->nonzerorowcnt) {
2768     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2769     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2770     PetscFunctionReturn(0);
2771   }
2772   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2773   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2774   if (!trans) {
2775     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2776     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2777   } else {
2778     if (herm || !A->form_explicit_transpose) {
2779       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2780       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2781     } else {
2782       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2783       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2784     }
2785   }
2786   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2787   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2788 
2789   try {
2790     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2791     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2792     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2793 
2794     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2795     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2796       /* z = A x + beta y.
2797          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2798          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2799       */
2800       xptr = xarray;
2801       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2802       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2803      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2804       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2805           allocated to accommodate different uses. So we get the length info directly from mat.
2806        */
2807       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2808         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2809         nx = mat->num_cols;
2810         ny = mat->num_rows;
2811       }
2812      #endif
2813     } else {
2814       /* z = A^T x + beta y
2815          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2816          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2817        */
2818       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2819       dptr = zarray;
2820       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2821       if (compressed) { /* Scatter x to work vector */
2822         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2823         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2824                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2825                          VecCUDAEqualsReverse());
2826       }
2827      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2828       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2829         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2830         nx = mat->num_rows;
2831         ny = mat->num_cols;
2832       }
2833      #endif
2834     }
2835 
2836     /* csr_spmv does y = alpha op(A) x + beta y */
2837     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2838      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2839       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2840       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2841         cudaError_t cerr;
2842         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2843         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2844         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2845                                 matstruct->matDescr,
2846                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2847                                 matstruct->cuSpMV[opA].vecYDescr,
2848                                 cusparse_scalartype,
2849                                 cusparsestruct->spmvAlg,
2850                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2851         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2852 
2853         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2854       } else {
2855         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2856         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2857         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2858       }
2859 
2860       stat = cusparseSpMV(cusparsestruct->handle, opA,
2861                                matstruct->alpha_one,
2862                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2863                                matstruct->cuSpMV[opA].vecXDescr,
2864                                beta,
2865                                matstruct->cuSpMV[opA].vecYDescr,
2866                                cusparse_scalartype,
2867                                cusparsestruct->spmvAlg,
2868                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2869      #else
2870       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2871       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2872                                mat->num_rows, mat->num_cols,
2873                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2874                                mat->values->data().get(), mat->row_offsets->data().get(),
2875                                mat->column_indices->data().get(), xptr, beta,
2876                                dptr);CHKERRCUSPARSE(stat);
2877      #endif
2878     } else {
2879       if (cusparsestruct->nrows) {
2880        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2881         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2882        #else
2883         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2884         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2885                                  matstruct->alpha_one, matstruct->descr, hybMat,
2886                                  xptr, beta,
2887                                  dptr);CHKERRCUSPARSE(stat);
2888        #endif
2889       }
2890     }
2891     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2892 
2893     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2894       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2895         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2896           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2897         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2898           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2899         }
2900       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2901         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2902       }
2903 
2904       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2905       if (compressed) {
2906         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
2908            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2909            prevent that. So I just add a ScatterAdd kernel.
2910          */
2911        #if 0
2912         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2913         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2914                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2915                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2916                          VecCUDAPlusEquals());
2917        #else
2918         PetscInt n = matstruct->cprowIndices->size();
2919         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2920        #endif
2921         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2922       }
2923     } else {
2924       if (yy && yy != zz) {
2925         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2926       }
2927     }
2928     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2929     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2930     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2931   } catch(char *ex) {
2932     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2933   }
2934   if (yy) {
2935     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2936   } else {
2937     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2938   }
2939   PetscFunctionReturn(0);
2940 }
2941 
/* zz = A^T*xx + yy on the GPU. Thin dispatcher into the shared mult-add kernel;
   the PETSC_TRUE selects the transpose path, the PETSC_FALSE the non-Hermitian
   variant (kernel signature is defined earlier in this file). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2950 
/* Finish assembly: run the host-side SeqAIJ assembly end, then, if a device-split
   CSR structure exists (matrix was assembled directly on the GPU), mark the GPU
   copy as the authoritative one. Flush assemblies and CPU-bound matrices require
   no offload-mask change. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscSplitCSRDataStructure *devMat = NULL;
  PetscErrorCode             ierr;

  PetscFunctionBegin;
  /* only non-factored matrices carry a Mat_SeqAIJCUSPARSE in spptr */
  if (A->factortype == MAT_FACTOR_NONE) devMat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* does very little if assembled on GPU */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && devMat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2967 
2968 /* --------------------------------------------------------------------------------*/
2969 /*@
2970    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2972    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2973    assembly performance the user should preallocate the matrix storage by setting
2974    the parameter nz (or the array nnz).  By setting these parameters accurately,
2975    performance during matrix assembly can be increased by more than a factor of 50.
2976 
2977    Collective
2978 
2979    Input Parameters:
2980 +  comm - MPI communicator, set to PETSC_COMM_SELF
2981 .  m - number of rows
2982 .  n - number of columns
2983 .  nz - number of nonzeros per row (same for all rows)
2984 -  nnz - array containing the number of nonzeros in the various rows
2985          (possibly different for each row) or NULL
2986 
2987    Output Parameter:
2988 .  A - the matrix
2989 
2990    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2992    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2993 
2994    Notes:
2995    If nnz is given then nz is ignored
2996 
2997    The AIJ format (also called the Yale sparse matrix format or
2998    compressed row storage), is fully compatible with standard Fortran 77
2999    storage.  That is, the stored row and column indices can begin at
3000    either one (as in Fortran) or zero.  See the users' manual for details.
3001 
3002    Specify the preallocated storage with either nz or nnz (not both).
3003    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3004    allocation.  For large problems you MUST preallocate memory or you
3005    will get TERRIBLE performance, see the users' manual chapter on matrices.
3006 
3007    By default, this format uses inodes (identical nodes) when possible, to
3008    improve numerical efficiency of matrix-vector products and solves. We
3009    search for consecutive rows with the same nonzero structure, thereby
3010    reusing matrix information to achieve increased efficiency.
3011 
3012    Level: intermediate
3013 
3014 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3015 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create an empty Mat and give it the CUSPARSE type, which installs the GPU operations */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate via the SeqAIJ routine directly (the type is already fixed); nnz, if non-NULL, overrides nz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3027 
/* Destroy the GPU side of the matrix (mult structs or triangular factors), free any
   device-split CSR structure used for on-GPU assembly, remove the composed methods,
   and finally destroy the host SeqAIJ data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device-split pointer before spptr is torn down; it is freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so the device arrays it points to can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i was separately allocated on the device only in the compressed-row case */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* detach all type-specific composed methods before falling back to the SeqAIJ destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3063 
3064 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3065 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying the host (SeqAIJ) representation and converting the copy
   in place to MATSEQAIJCUSPARSE; the GPU data of A is not copied directly. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3075 
/* Y = a*X + Y on the GPU. Fast paths: a single cuBLAS axpy over the value arrays
   when both matrices have the same nonzero pattern, and cusparse csrgeam when X's
   pattern is a subset of Y's. Other cases fall back to the CPU implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices do not share the CUSPARSE axpy (e.g. one is bound to the CPU): do the work on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row offsets/column indices on the device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* cusparse csrgeam: Y = a*X + 1.0*Y, writing the result in place into Y's arrays */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host here, so switch the handle to host pointer mode for this call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA-11 requires the query-allocate-execute workflow with an explicit work buffer */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the default device pointer mode used elsewhere in this file */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical sparsity: treat the value arrays as flat vectors and use cuBLAS axpy */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different pattern: fall back to the CPU implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3173 
/* Y = a*Y: scale the nonzero value array in place with a single cuBLAS scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   inc = 1, n = 1;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,n,&a,vals,inc);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  /* scaling the values invalidates any cached diagonal factorization data */
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3195 
/* Zero all stored values: fill the device CSR value arrays (matrix and cached
   transpose, when present) and the host value array. The offload mask records
   whether both copies were zeroed or only the host one. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuzeroed = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        gpuzeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      /* keep the cached transpose consistent instead of invalidating it */
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the host copy as well (equivalent to MatZeroEntries_SeqAIJ without its extra bookkeeping) */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = gpuzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3227 
/* Switch the matrix between CPU (flg=PETSC_TRUE) and GPU execution by swapping the
   op-table entries and the composed methods. Factored matrices are never rebound.
   When binding to the CPU the host copy is refreshed from the device first. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host arrays are current before routing operations to the CPU */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* remove the GPU-only composed methods so queries fall back to defaults */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inode optimizations only apply to the host kernels */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3278 
3279 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
3280 {
3281   PetscErrorCode   ierr;
3282   cusparseStatus_t stat;
3283   Mat              B;
3284 
3285   PetscFunctionBegin;
3286   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
3287   if (reuse == MAT_INITIAL_MATRIX) {
3288     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
3289   } else if (reuse == MAT_REUSE_MATRIX) {
3290     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
3291   }
3292   B = *newmat;
3293 
3294   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
3295   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
3296 
3297   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
3298     if (B->factortype == MAT_FACTOR_NONE) {
3299       Mat_SeqAIJCUSPARSE *spptr;
3300       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3301       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3302       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3303       spptr->format     = MAT_CUSPARSE_CSR;
3304      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3305       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3306       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3307       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3308      #endif
3309       B->spptr = spptr;
3310     } else {
3311       Mat_SeqAIJCUSPARSETriFactors *spptr;
3312 
3313       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3314       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3315       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3316       B->spptr = spptr;
3317     }
3318     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
3319   }
3320   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
3321   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3322   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
3323   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
3324   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3325   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
3326 
3327   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
3328   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3329   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
3330   PetscFunctionReturn(0);
3331 }
3332 
/* Type-creation routine registered for MATSEQAIJCUSPARSE: build a plain SeqAIJ
   matrix, then convert it in place to the CUSPARSE type */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3342 
3343 /*MC
3344    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3345 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3347    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3348    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3349 
3350    Options Database Keys:
3351 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3352 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3353 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3354 
3355   Level: beginner
3356 
3357 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3358 M*/
3359 
3360 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3361 
/* Register the CUSPARSE solver packages: the banded LU solver for plain SeqAIJ
   matrices and the cusparse factorizations (LU, Cholesky, ILU, ICC) for
   MATSEQAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscInt            i;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (i=0; i<(PetscInt)(sizeof(ftypes)/sizeof(ftypes[0])); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3375 
/* Tear down a Mat_SeqAIJCUSPARSE: destroy the mult structs (matrix and cached
   transpose), free all thrust-managed device arrays, destroy the cusparse handle
   and release the struct itself. A NULL input is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  cusparseStatus_t   stat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;
  if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3395 
/* Release a CsrMatrix: delete its thrust device vectors (values, column indices,
   row offsets), free the struct and NULL the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3408 
/* Free one triangular-factor struct: cusparse matrix descriptor, analysis info,
   CSR storage, device solve buffer and pinned host buffer; PetscFree then releases
   the struct (and zeroes *trifactor). */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is host memory allocated with cudaMallocHost, hence cudaFreeHost */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* the csr2csc work buffer only exists with the CUDA-11 generic API */
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3428 
/* Free a SpMV/SpMM mult struct: the stored matrix (CSR, or HYB before CUDA-11),
   the cusparse descriptor, the device-resident scalar constants and, with the
   CUDA-11 generic API, the per-operation SpMV buffers and vector descriptors.
   *matstruct is reset to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* fix: the error code was previously discarded; check it like the other destroy paths do */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* the scalar constants live in device memory, hence cudaFree */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3472 
/* Free all GPU resources held by the triangular-factor container while leaving
   the container itself allocated for reuse. Safe to call repeatedly: every freed
   pointer is reset to NULL so a second call is a no-op. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* fix: NULL the band-factor buffers after freeing them; previously they kept their
       stale values, so a second Reset (e.g. an explicit Reset followed by Destroy, which
       calls Reset again) would cudaFree the same pointers twice */
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);(*trifactors)->a_band_d = NULL;}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);(*trifactors)->i_band_d = NULL;}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3495 
/* Fully destroy the triangular-factor container: reset (free) its contents,
   destroy the cusparse handle and free the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* fix: separate the assignment from the test; `if (handle = ...)` draws
       -Wparentheses warnings and invites an ==/= misread */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3512 
/* Strict-weak ordering on (row, col) index pairs: sort by row, then by column.
   Used as a comparator for thrust::sort-style algorithms on zipped (i,j) pairs. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b)
  {
    const PetscInt arow = a.get<0>(), brow = b.get<0>();
    if (arow != brow) return arow < brow;
    return a.get<1>() < b.get<1>();
  }
};
3523 
/* Equality predicate on (i,j) index pairs: true iff both components match.
   Used to collapse duplicate COO entries with thrust::unique. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3533 
/* Binary op yielding 1 when its two arguments differ, 0 when they are equal.
   Applied via adjacent_difference to flag the start of each new index run. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3542 
/* Logical OR of two integer flags, returned as 0/1.
   Combines the row-changed and column-changed markers produced by IJDiff. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0) || (t2 != 0);
  }
};
3551 
3552 #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the numerical values of a matrix whose
  nonzero pattern was fixed earlier by MatSetPreallocationCOO_SeqAIJCUSPARSE().

  Input Parameters:
+ A     - the matrix (must carry the cooPerm data built at preallocation time)
. v     - the values, one per COO entry in the caller's original ordering; may
          live in host or device memory (detected with isCudaMem), or be NULL
- imode - INSERT_VALUES or ADD_VALUES; any mode other than ADD_VALUES is
          treated as insert

  If v is NULL, INSERT_VALUES zeroes the stored values and ADD_VALUES leaves
  them unchanged; in both cases the matrix is still marked assembled below.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging copy of v, allocated only when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation information: nothing to scatter, just run a regular final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) { /* NULL values: INSERT zeroes the matrix, ADD changes nothing */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size(); /* number of COO entries originally passed by the caller */
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* host values: copy them to the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* cooPerm permutes v into the sorted COO order; cooPerm_a (when non-NULL)
     additionally maps each sorted entry to its unique CSR slot, so duplicate
     (i,j) entries are summed with reduce_by_key */
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* duplicates present: reduce them into a scratch array, then accumulate into the CSR values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else { /* no duplicates: add each permuted value directly onto its CSR slot */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else { /* no duplicates: overwrite each CSR slot with its permuted value */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy of the values is now the valid one */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3629 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached explicit transpose of
  A as out of date; with destroy == PETSC_TRUE the cached transpose structure
  and the csr2csc permutation array are freed as well.

  A matrix without a CUSPARSE struct (spptr NULL) is left untouched.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) { /* free the cached data, not just flag it stale */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3646 
3647 #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR nonzero pattern of A
  from n COO entries (coo_i[k],coo_j[k]), sorting and uniquifying them on the
  GPU.

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices (host array)
- coo_j - column indices (host array)

  On exit cusp->cooPerm holds, for each sorted slot, the position of that
  entry in the caller's original ordering, so MatSetValuesCOO_SeqAIJCUSPARSE()
  can later scatter a value array given in the original order.  If the list
  contains duplicate (i,j) pairs, cusp->cooPerm_a additionally maps each
  sorted entry to its unique CSR slot; otherwise cooPerm_a is deleted and left
  NULL.  The values are zeroed and the matrix is left unassembled: the caller
  must still provide values through MatSetValuesCOO.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* entry count changed: previous permutation data cannot be reused */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);         /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);         /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n); /* per-row cumulative entry counts, becomes a->i[1..m] */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* cooPerm starts as the identity 0..n-1; sort_by_key permutes it alongside
       the (i,j) keys, so afterwards cooPerm[sortedPos] = original position */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i; /* keep copies of the sorted i and j before unique() compacts them */
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* duplicates exist: build, for each sorted entry, the index of its unique (i,j) slot.
                adjacent_difference flags positions where i (resp. j) changed from the previous
                entry; OR-ing the two flags marks the start of each distinct (i,j) run, and the
                inclusive scan turns those run-start markers into unique-slot indices */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0; /* first entry starts run 0 */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of unique entries with row index <= r, i.e. the CSR row offset a->i[r+1] */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* after unique(), the first a->nz positions of d_j hold the unique column indices */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else { /* no entries: fall back to an empty standard preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* only the pattern is set; values must still come through MatSetValuesCOO */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3746 
/*
  MatSeqAIJCUSPARSEGetArrayRead - returns a read-only device pointer to the CSR
  values of A, first copying the latest host values to the GPU if needed.
  Pair with MatSeqAIJCUSPARSERestoreArrayRead(); only the CSR storage format is
  supported.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the device values are current */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3765 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - restores the pointer obtained with
  MatSeqAIJCUSPARSEGetArrayRead() and clears it.  Since the access was
  read-only, the object state is deliberately NOT increased here.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the caller's copy of the pointer */
  PetscFunctionReturn(0);
}
3775 
/*
  MatSeqAIJCUSPARSEGetArray - returns a read/write device pointer to the CSR
  values of A, first copying the latest host values to the GPU if needed.
  Marks the device copy as the valid one and invalidates the cached transpose
  values.  Pair with MatSeqAIJCUSPARSERestoreArray(); only the CSR storage
  format is supported.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the device values are current before handing them out */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller may modify the device values */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* cached transpose values become stale */
  PetscFunctionReturn(0);
}
3796 
/*
  MatSeqAIJCUSPARSERestoreArray - restores the pointer obtained with
  MatSeqAIJCUSPARSEGetArray() and clears it.  The object state is increased
  because the values may have been modified through the pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values may have changed */
  *a = NULL; /* invalidate the caller's copy of the pointer */
  PetscFunctionReturn(0);
}
3809 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - returns a write-only device pointer to the
  CSR values of A.  Unlike MatSeqAIJCUSPARSEGetArray(), the current host values
  are NOT copied to the GPU first, so the existing contents of the returned
  buffer are unspecified.  Marks the device copy as the valid one and
  invalidates the cached transpose values.  Pair with
  MatSeqAIJCUSPARSERestoreArrayWrite(); only the CSR storage format is
  supported.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller will write the device values */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* cached transpose values become stale */
  PetscFunctionReturn(0);
}
3829 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - restores the pointer obtained with
  MatSeqAIJCUSPARSEGetArrayWrite() and clears it.  The object state is
  increased because the values were (presumably) written through the pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values changed */
  *a = NULL; /* invalidate the caller's copy of the pointer */
  PetscFunctionReturn(0);
}
3842 
/* Strict weak ordering on (row,col,value,flag) merge tuples: lexicographic on
   (row,col) only; the value and flag components do not influence the order. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;       /* different rows: row decides */
    return t1.get<1>() < t2.get<1>();   /* same row: column decides */
  }
};
3853 
/* Unary functor that adds a fixed offset to an integer index; used below to
   shift column indices and row offsets when concatenating matrices. */
struct Shift
{
  int _offset; /* amount added to every input index */

  Shift(int shift) : _offset(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _offset;
  }
};
3865 
/* merges two SeqAIJCUSPARSE matrices side by side, C = [A, B] (i.e. [A';B']' in matlab notation) */
3867 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3868 {
3869   PetscErrorCode               ierr;
3870   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3871   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3872   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3873   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3874   PetscInt                     Annz,Bnnz;
3875   cusparseStatus_t             stat;
3876   PetscInt                     i,m,n,zero = 0;
3877   cudaError_t                  cerr;
3878 
3879   PetscFunctionBegin;
3880   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3881   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3882   PetscValidPointer(C,4);
3883   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3884   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3885   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3886   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3887   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3888   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3889   if (reuse == MAT_INITIAL_MATRIX) {
3890     m     = A->rmap->n;
3891     n     = A->cmap->n + B->cmap->n;
3892     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3893     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3894     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3895     c     = (Mat_SeqAIJ*)(*C)->data;
3896     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3897     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3898     Ccsr  = new CsrMatrix;
3899     Cmat->cprowIndices      = NULL;
3900     c->compressedrow.use    = PETSC_FALSE;
3901     c->compressedrow.nrows  = 0;
3902     c->compressedrow.i      = NULL;
3903     c->compressedrow.rindex = NULL;
3904     Ccusp->workVector       = NULL;
3905     Ccusp->nrows    = m;
3906     Ccusp->mat      = Cmat;
3907     Ccusp->mat->mat = Ccsr;
3908     Ccsr->num_rows  = m;
3909     Ccsr->num_cols  = n;
3910     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3911     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3912     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3913     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3914     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3915     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3916     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3917     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3918     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3919     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3920     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3921     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
3922     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3923     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3924     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3925 
3926     Acsr = (CsrMatrix*)Acusp->mat->mat;
3927     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3928     Annz = (PetscInt)Acsr->column_indices->size();
3929     Bnnz = (PetscInt)Bcsr->column_indices->size();
3930     c->nz = Annz + Bnnz;
3931     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3932     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3933     Ccsr->values = new THRUSTARRAY(c->nz);
3934     Ccsr->num_entries = c->nz;
3935     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3936     if (c->nz) {
3937       auto Acoo = new THRUSTINTARRAY32(Annz);
3938       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3939       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3940       THRUSTINTARRAY32 *Aroff,*Broff;
3941 
3942       if (a->compressedrow.use) { /* need full row offset */
3943         if (!Acusp->rowoffsets_gpu) {
3944           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3945           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3946           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3947         }
3948         Aroff = Acusp->rowoffsets_gpu;
3949       } else Aroff = Acsr->row_offsets;
3950       if (b->compressedrow.use) { /* need full row offset */
3951         if (!Bcusp->rowoffsets_gpu) {
3952           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3953           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3954           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3955         }
3956         Broff = Bcusp->rowoffsets_gpu;
3957       } else Broff = Bcsr->row_offsets;
3958       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3959       stat = cusparseXcsr2coo(Acusp->handle,
3960                               Aroff->data().get(),
3961                               Annz,
3962                               m,
3963                               Acoo->data().get(),
3964                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3965       stat = cusparseXcsr2coo(Bcusp->handle,
3966                               Broff->data().get(),
3967                               Bnnz,
3968                               m,
3969                               Bcoo->data().get(),
3970                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3971       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3972       auto Aperm = thrust::make_constant_iterator(1);
3973       auto Bperm = thrust::make_constant_iterator(0);
3974 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3975       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3976       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
3977 #else
3978       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
3979       auto Bcib = Bcsr->column_indices->begin();
3980       auto Bcie = Bcsr->column_indices->end();
3981       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
3982 #endif
3983       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
3984       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
3985       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
3986       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
3987       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
3988       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
3989       auto p1 = Ccusp->cooPerm->begin();
3990       auto p2 = Ccusp->cooPerm->begin();
3991       thrust::advance(p2,Annz);
3992       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
3993 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
3994       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
3995 #endif
3996       auto cci = thrust::make_counting_iterator(zero);
3997       auto cce = thrust::make_counting_iterator(c->nz);
3998 #if 0 //Errors on SUMMIT cuda 11.1.0
3999       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4000 #else
4001       auto pred = thrust::identity<int>();
4002       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4003       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4004 #endif
4005       stat = cusparseXcoo2csr(Ccusp->handle,
4006                               Ccoo->data().get(),
4007                               c->nz,
4008                               m,
4009                               Ccsr->row_offsets->data().get(),
4010                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4011       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4012       delete wPerm;
4013       delete Acoo;
4014       delete Bcoo;
4015       delete Ccoo;
4016 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4017       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4018                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4019                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4020                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4021 #endif
4022       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4023         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4024         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4025         CsrMatrix *CcsrT = new CsrMatrix;
4026         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4027         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4028 
4029         (*C)->form_explicit_transpose = PETSC_TRUE;
4030         (*C)->transupdated = PETSC_TRUE;
4031         Ccusp->rowoffsets_gpu = NULL;
4032         CmatT->cprowIndices = NULL;
4033         CmatT->mat = CcsrT;
4034         CcsrT->num_rows = n;
4035         CcsrT->num_cols = m;
4036         CcsrT->num_entries = c->nz;
4037 
4038         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4039         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4040         CcsrT->values = new THRUSTARRAY(c->nz);
4041 
4042         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4043         auto rT = CcsrT->row_offsets->begin();
4044         if (AT) {
4045           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4046           thrust::advance(rT,-1);
4047         }
4048         if (BT) {
4049           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4050           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4051           thrust::copy(titb,tite,rT);
4052         }
4053         auto cT = CcsrT->column_indices->begin();
4054         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4055         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4056         auto vT = CcsrT->values->begin();
4057         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4058         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4059         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4060 
4061         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4062         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4063         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4064         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4065         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4066         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4067         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4068         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4069         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4070 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4071         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4072                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4073                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4074                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4075 #endif
4076         Ccusp->matTranspose = CmatT;
4077       }
4078     }
4079 
4080     c->singlemalloc = PETSC_FALSE;
4081     c->free_a       = PETSC_TRUE;
4082     c->free_ij      = PETSC_TRUE;
4083     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4084     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4085     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4086       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4087       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4088       ii   = *Ccsr->row_offsets;
4089       jj   = *Ccsr->column_indices;
4090       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4091       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4092     } else {
4093       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4094       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4095     }
4096     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4097     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4098     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4099     c->maxnz = c->nz;
4100     c->nonzerorowcnt = 0;
4101     c->rmax = 0;
4102     for (i = 0; i < m; i++) {
4103       const PetscInt nn = c->i[i+1] - c->i[i];
4104       c->ilen[i] = c->imax[i] = nn;
4105       c->nonzerorowcnt += (PetscInt)!!nn;
4106       c->rmax = PetscMax(c->rmax,nn);
4107     }
4108     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4109     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4110     (*C)->nonzerostate++;
4111     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4112     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4113     Ccusp->nonzerostate = (*C)->nonzerostate;
4114     (*C)->preallocated  = PETSC_TRUE;
4115   } else {
4116     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4117     c = (Mat_SeqAIJ*)(*C)->data;
4118     if (c->nz) {
4119       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4120       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4121       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4122       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4123       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4124       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4125       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4126       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4127       Acsr = (CsrMatrix*)Acusp->mat->mat;
4128       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4129       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4130       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4131       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4132       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4133       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4134       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4135       auto pmid = Ccusp->cooPerm->begin();
4136       thrust::advance(pmid,Acsr->num_entries);
4137       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4138       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4139                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4140       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4141                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4142       thrust::for_each(zibait,zieait,VecCUDAEquals());
4143       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4144                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4145       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4146                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4147       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4148       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4149       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4150         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4151         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4152         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4153         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4154         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4155         auto vT = CcsrT->values->begin();
4156         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4157         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4158         (*C)->transupdated = PETSC_TRUE;
4159       }
4160       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4161     }
4162   }
4163   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4164   (*C)->assembled     = PETSC_TRUE;
4165   (*C)->was_assembled = PETSC_FALSE;
4166   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4167   PetscFunctionReturn(0);
4168 }
4169 
/* Copy (a subset of) the nonzero values of A into the user array v.

   If idx is provided, v[k] = a[idx[k]] for k = 0..n-1 (a gather from the device
   CSR value array); otherwise the first n values are copied contiguously.
   v may point to either host or device memory; the destination kind is detected
   with isCudaMem() and the copy/staging strategy is chosen accordingly. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;            /* true if v is device memory */
  const PetscScalar *av;             /* device pointer to A's CSR values */
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set to the device for the permutation gather */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* if v is host memory, stage the gathered values in a device buffer w */
    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather: dv[k] = dav[widx[k]] via a zip of (permuted source, destination) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) { /* bring the staged result back to the host destination */
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: contiguous copy of the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* a host destination implies a device->host transfer was performed above;
     log it in the correct direction (was incorrectly logged as CpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4209