xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision fdce6efa4ac0151d65991f52b54a5692b77db69c)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 #include <thrust/async/for_each.h>
17 
18 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
19 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
20   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
21     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
22 
23   typedef enum {
24       CUSPARSE_MV_ALG_DEFAULT = 0,
25       CUSPARSE_COOMV_ALG      = 1,
26       CUSPARSE_CSRMV_ALG1     = 2,
27       CUSPARSE_CSRMV_ALG2     = 3
28   } cusparseSpMVAlg_t;
29 
30   typedef enum {
31       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
32       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
33       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
34       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
35       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
36       CUSPARSE_SPMM_ALG_DEFAULT = 0,
37       CUSPARSE_SPMM_COO_ALG1    = 1,
38       CUSPARSE_SPMM_COO_ALG2    = 2,
39       CUSPARSE_SPMM_COO_ALG3    = 3,
40       CUSPARSE_SPMM_COO_ALG4    = 5,
41       CUSPARSE_SPMM_CSR_ALG1    = 4,
42       CUSPARSE_SPMM_CSR_ALG2    = 6,
43   } cusparseSpMMAlg_t;
44 
45   typedef enum {
46       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
47       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
48   } cusparseCsr2CscAlg_t;
49   */
50   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
51   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
52   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
53 #endif
54 
55 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
57 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
58 
59 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
61 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62 
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
64 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
66 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
67 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
68 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
69 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
70 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
71 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
75 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
77 
78 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
80 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
81 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
82 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
83 
84 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
86 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
87 
88 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
89 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's
   Mat_SeqAIJCUSPARSE structure and installs it on the cuSPARSE handle,
   so subsequent cuSPARSE calls on this matrix are issued on that stream.

   Errors with PETSC_ERR_COR if the matrix has no cuSPARSE structure attached.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
104 
/*
   MatCUSPARSESetHandle - replaces the cuSPARSE handle stored in the matrix's
   Mat_SeqAIJCUSPARSE structure with the caller-supplied one. Any previously
   stored (different) handle is destroyed first. The pointer mode of the
   resulting handle is set to CUSPARSE_POINTER_MODE_DEVICE.

   Errors with PETSC_ERR_COR if the matrix has no cuSPARSE structure attached.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* tear down the old handle before adopting the caller's */
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
121 
/*
   MatCUSPARSEClearHandle - forgets (without destroying) the cuSPARSE handle
   stored in the matrix's Mat_SeqAIJCUSPARSE structure. A no-op when the
   matrix is not of type MATSEQAIJCUSPARSE or has no cuSPARSE structure.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  if (isseqaijcusparse && cusp) {
    /* only drop the reference; ownership of the handle lies elsewhere */
    if (cusp->handle) cusp->handle = 0;
  }
  PetscFunctionReturn(0);
}
134 
/*
   MatFactorGetSolverType_seqaij_cusparse - query callback composed on factor
   matrices created by MatGetFactor_seqaijcusparse_cusparse(); reports that the
   factorization is provided by the MATSOLVERCUSPARSE solver package.
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
141 
142 /*MC
143   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
144   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
145 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
146   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
147   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
148   algorithms are not recommended. This class does NOT support direct solver operations.
149 
150   Level: beginner
151 
152 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
153 M*/
154 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an (empty) MATSEQAIJCUSPARSE
   matrix B to hold a factorization of A, wiring up the symbolic-factorization
   function pointers and preferred orderings for the requested factor type.

   Supported ftype values: MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT,
   MAT_FACTOR_CHOLESKY, MAT_FACTOR_ICC; anything else raises PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       nrows = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,nrows,nrows,nrows,nrows);CHKERRQ(ierr);
  /* factortype must be set before MatSetType() so the create routine sees it */
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* nested dissection for full LU, natural ordering for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* no values are stored until the numeric factorization fills them in */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
185 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - type-specific implementation behind
   MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE. A sequential matrix stores a
   single format, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set it; any
   other operation raises PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard against a missing GPU structure, matching MatCUSPARSESetStream()/MatCUSPARSESetHandle() */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations map to the one stored format */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
203 
204 /*@
205    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
206    operation. Only the MatMult operation can use different GPU storage formats
207    for MPIAIJCUSPARSE matrices.
208    Not Collective
209 
210    Input Parameters:
211 +  A - Matrix of type SEQAIJCUSPARSE
212 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
213 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
214 
215    Output Parameter:
216 
217    Level: intermediate
218 
219 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
220 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed; silently a no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
230 
/*
   MatSetOption_SeqAIJCUSPARSE - handles MAT_FORM_EXPLICIT_TRANSPOSE locally
   (invalidating any cached GPU transpose when the option is switched off) and
   forwards every other option to MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
248 
249 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
250 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization: pulls A back
   to the host, runs the CPU SeqAIJ numeric factorization, selects the solve
   kernels (natural-ordering fast path when both permutations are identities),
   and pushes the resulting triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the CPU factorization below needs current values on the host */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no MatMatSolve support in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
281 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the runtime options of a
   MATSEQAIJCUSPARSE matrix: storage format for SpMV and/or triangular solves
   and, for CUDA >= 11, the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
   Factored matrices (factortype != MAT_FACTOR_NONE) take no options here.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* format for MatMult only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for all operations; processed after the MULT-only option so it wins when both are given */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
317 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - symbolic ILU phase: performed by the
   CPU SeqAIJ routine after discarding any stale GPU triangular-factor data,
   then routes the numeric phase to the CUSPARSE-aware implementation.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* factors from an earlier factorization must not survive a new symbolic phase */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
329 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - symbolic LU phase: performed by the
   CPU SeqAIJ routine after discarding any stale GPU triangular-factor data,
   then routes the numeric phase to the CUSPARSE-aware implementation.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* factors from an earlier factorization must not survive a new symbolic phase */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
341 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - symbolic ICC phase: performed by the
   CPU SeqAIJ routine after discarding any stale GPU triangular-factor data,
   then routes the numeric phase to the CUSPARSE-aware Cholesky implementation.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* factors from an earlier factorization must not survive a new symbolic phase */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
353 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - symbolic Cholesky phase: performed
   by the CPU SeqAIJ routine after discarding any stale GPU triangular-factor
   data, then routes the numeric phase to the CUSPARSE-aware implementation.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* factors from an earlier factorization must not survive a new symbolic phase */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
365 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds (or, on refactorization,
   refreshes the values of) the unit-lower-triangular factor of a factored
   SeqAIJ matrix as a CSR matrix on the GPU and runs the cuSPARSE triangular
   solve analysis on it.

   The host-side SeqAIJ factored storage keeps L's strictly-lower part row by
   row in a->a / a->j; the explicit unit diagonal is appended here. The CSR
   arrays are staged in pinned host memory (cudaMallocHost) and then copied
   into thrust device vectors. On the "update values only" path just the
   values array is rebuilt and re-uploaded; structure and analysis are reused.

   NOTE(review): assumes a->i/a->j/a->a follow the factored-SeqAIJ layout
   produced by MatLUFactorNumeric_SeqAIJ — confirm against aijfact.c.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (row 0 holds only the implicit diagonal 1, hence ai[1]-ai[0] strictly-lower entries start at row 1) */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) { /* first factorization: build structure + values from scratch */
        PetscScalar                       *AALo;

        /* pinned host buffers for the CSR values/offsets/columns being assembled */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;     /* row 0 is just the unit diagonal */
        v        = aa;
        vi       = aj;
        offset   = 1;                    /* next free slot in AjLo/AALo */
        rowOffset= 1;                    /* running row pointer for AiLo */
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* strictly-lower entries of row i, then the appended unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* upload the assembled CSR arrays to the device */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is retained as the host staging buffer for later value updates; Ai/Aj are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix: same layout as above, values only */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;   /* unit diagonal entry for row i */
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
510 
511 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
512 {
513   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
514   PetscInt                          n = A->rmap->n;
515   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
516   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
517   cusparseStatus_t                  stat;
518   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
519   const MatScalar                   *aa = a->a,*v;
520   PetscInt                          *AiUp, *AjUp;
521   PetscInt                          i,nz, nzUpper, offset;
522   PetscErrorCode                    ierr;
523   cudaError_t                       cerr;
524 
525   PetscFunctionBegin;
526   if (!n) PetscFunctionReturn(0);
527   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
528     try {
529       /* next, figure out the number of nonzeros in the upper triangular matrix. */
530       nzUpper = adiag[0]-adiag[n];
531       if (!upTriFactor) {
532         PetscScalar *AAUp;
533 
534         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
535 
536         /* Allocate Space for the upper triangular matrix */
537         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
538         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
539 
540         /* Fill the upper triangular matrix */
541         AiUp[0]=(PetscInt) 0;
542         AiUp[n]=nzUpper;
543         offset = nzUpper;
544         for (i=n-1; i>=0; i--) {
545           v  = aa + adiag[i+1] + 1;
546           vi = aj + adiag[i+1] + 1;
547 
548           /* number of elements NOT on the diagonal */
549           nz = adiag[i] - adiag[i+1]-1;
550 
551           /* decrement the offset */
552           offset -= (nz+1);
553 
554           /* first, set the diagonal elements */
555           AjUp[offset] = (PetscInt) i;
556           AAUp[offset] = (MatScalar)1./v[nz];
557           AiUp[i]      = AiUp[i+1] - (nz+1);
558 
559           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
560           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
561         }
562 
563         /* allocate space for the triangular factor information */
564         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
565         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
566 
567         /* Create the matrix description */
568         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
569         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
570        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
571         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
572        #else
573         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
574        #endif
575         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
576         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
577 
578         /* set the operation */
579         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
580 
581         /* set the matrix */
582         upTriFactor->csrMat = new CsrMatrix;
583         upTriFactor->csrMat->num_rows = n;
584         upTriFactor->csrMat->num_cols = n;
585         upTriFactor->csrMat->num_entries = nzUpper;
586 
587         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
589 
590         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
592 
593         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
595 
596         /* Create the solve analysis information */
597         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
598         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
599       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
604                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
605         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
606       #endif
607 
608         /* perform the solve analysis */
609         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
613                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
615                                #endif
616 );CHKERRCUSPARSE(stat);
617         cerr = WaitForCUDA();CHKERRCUDA(cerr);
618         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
619 
620         /* assign the pointer */
621         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
622         upTriFactor->AA_h = AAUp;
623         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
624         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
625         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
626       } else {
627         if (!upTriFactor->AA_h) {
628           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
629         }
630         /* Fill the upper triangular matrix */
631         offset = nzUpper;
632         for (i=n-1; i>=0; i--) {
633           v  = aa + adiag[i+1] + 1;
634 
635           /* number of elements NOT on the diagonal */
636           nz = adiag[i] - adiag[i+1]-1;
637 
638           /* decrement the offset */
639           offset -= (nz+1);
640 
641           /* first, set the diagonal elements */
642           upTriFactor->AA_h[offset] = 1./v[nz];
643           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
644         }
645         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
646         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
647       }
648     } catch(char *ex) {
649       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
650     }
651   }
652   PetscFunctionReturn(0);
653 }
654 
/*
   Builds both ILU triangular factors on the GPU and caches the row/column
   permutations on the device (only when they are not the identity).

   Input Parameter:
.  A - the factored matrix, whose spptr holds the Mat_SeqAIJCUSPARSETriFactors
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow    = a->row;
  IS                           iscol    = a->icol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     n        = A->rmap->n;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* build (or refresh the numerical values of) both triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device, once, when it is non-trivial */
  ierr = ISIdentity(isrow,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(r,r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* likewise for the column permutation */
  ierr = ISIdentity(iscol,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(c,c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
698 
/*
   Builds the ICC (incomplete Cholesky) triangular factors on the GPU from the
   host factor stored in A->data.

   First call (no factor structs yet): constructs the CSR structure of the upper
   triangle U on pinned host memory, then uploads it twice -- once as the upper
   factor (unit diagonal stored as 1/d) and once, with rescaled values, as the
   lower factor, which is solved via CUSPARSE_OPERATION_TRANSPOSE on the same
   upper-triangular CSR structure.  Subsequent calls (numeric re-factorization):
   only the values arrays are refilled and re-uploaded; the sparsity structure
   and the cuSPARSE analysis info are reused.

   NOTE(review): A->data is cast both to Mat_SeqAIJ (for a->nz) and to
   Mat_SeqSBAIJ (for b->i/b->j/b->a) -- presumably the ICC factor stores an
   SBAIJ-compatible layout in the leading fields; confirm against the factor
   creation code.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper/lower factor values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* the diagonal of the host factor holds d; both factors store 1/d */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negate for U, additionally scale by 1/d for L */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* UNIT diagonal: the stored 1/d entries are applied by the lower solve */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* UPPER fill + TRANSPOSE op below: the lower factor is solved as U^T */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* structure (row offsets/column indices) is shared with the upper factor */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* re-factorization: structure is unchanged, refresh only the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
914 
/*
   Builds the ICC triangular factors on the GPU and caches the permutation and
   its inverse on the device when the ordering is not the identity.

   Input Parameter:
.  A - the factored matrix, whose spptr holds the Mat_SeqAIJCUSPARSETriFactors
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* ICC stores only the upper triangle; both factors share its off-diagonals */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: cache the permutation (rperm) and its inverse
     (cperm) on the device.  Guard on rpermIndices so that a repeated numeric
     factorization does not re-allocate the device arrays and leak the old
     ones -- same pattern as MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
952 
/*
   Numeric Cholesky factorization: runs the host (SeqAIJ) factorization, then
   installs the appropriate GPU triangular-solve routines and pushes the
   factors to the device.

   Input Parameters:
+  B    - the factor matrix
.  A    - the matrix being factored
-  info - factorization options
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             ip = b->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization is done on the host, so bring the matrix down */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the natural-ordering
     variants avoid the permutation scatter/gather */
  ierr = ISIdentity(ip,&natural);CHKERRQ(ierr);
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no GPU MatMatSolve is provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
982 
/*
   Builds the transposes (CSC forms) of both triangular factors and runs the
   cuSPARSE solve analysis on them, so MatSolveTranspose can be performed with
   NON_TRANSPOSE operations.  The fill mode of each transposed descriptor is
   flipped (upper<->lower) relative to the original factor.

   Input Parameter:
.  A - the factored matrix; its spptr's loTriFactorPtr/upTriFactorPtr must
       already be built, and the resulting loTriFactorPtrTranspose /
       upTriFactorPtrTranspose are stored back into the same struct
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor;
     the fill mode is flipped since the transpose swaps the triangles */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this previously called PetscLogEventBegin again, leaving the
     MAT_CUSPARSEGenerateTranspose event unbalanced in the log */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this previously called PetscLogEventBegin again, leaving the
     MAT_CUSPARSEGenerateTranspose event unbalanced in the log */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1184 
/* Unary functor that converts a PetscScalar into a PetscInt by truncating its
   real part. Used with thrust::transform below to turn the scalar-encoded
   csr2csc permutation values into an integer index array. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar x)
  {
    const PetscReal r = PetscRealPart(x);
    return (PetscInt)r;
  }
};
1193 
/* Build (or refresh) an explicit device-side transpose of A so that transposed
   multiplies can use a plain SpMV with A^T instead of a transposed operation.
   The transpose is cached in cusparsestruct->matTranspose; it is rebuilt only
   when A->transupdated is false. For the CSR format the structure is created
   once and subsequent calls only update the numerical values via a cached
   permutation (csr2csc_i). */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Nothing to do if an explicit transpose was not requested or the matrix is empty */
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* ensure the device copy of A is current */
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still valid */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* ELL/HYB transposes cannot be updated in place: discard and rebuild from scratch */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalars used by the SpMV routines */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Allocate the CSR storage for A^T: dimensions are swapped, nnz is unchanged */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* Mirror A's host row offsets onto the device for the csr2csc conversion below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 uses the generic SpMat descriptor API */
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* Legacy (pre CUDA-11) path: HYB -> CSR -> CSC (= transpose CSR) -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT->num_rows/num_cols copy A's dimensions rather than the
         transposed ones, and row_offsets is sized with A->rmap->n+1 -- this looks
         safe only for square matrices; confirm before relying on this legacy path
         for rectangular A. */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build (once) the CSR->CSC value permutation: run csr2csc on the values
         0,1,...,nnz-1 (encoded as scalars) so the output values ARE the permutation */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* csr2cscEx2 requires an explicitly sized work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* Decode the permutation from the scalar-valued output into integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* Numerical update: gather A's values through the cached permutation into A^T */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1400 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU using the transposed triangular factors (built lazily
   by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve), honoring the row/column
   permutations from the factorization. Since A = LU implies A^T = U^T L^T, the
   upper factor is solved first, then the lower one. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b[rperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: reads x, writes the intermediate result into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: reads tempGPU, writes back into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* a triangular solve pair costs ~2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1487 
/* Solve A^T x = b on the GPU for a factorization in natural ordering (no row or
   column permutations needed). Solves U^T-style factor first, then L, mirroring
   A^T = U^T L^T. Transposed factors are generated lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: reads b, writes the intermediate result into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: reads tempGPU, writes the solution into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* a triangular solve pair costs ~2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1555 
/* Solve A x = b on the GPU using the cached triangular factors: apply the row
   permutation to b, forward-solve L, back-solve U, then apply the column
   permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: tempGPU <- b[rperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: reads tempGPU, writes the intermediate result into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: reads x, writes the unpermuted solution into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x <- tempGPU[cperm] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* a triangular solve pair costs ~2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1630 
/* Solve A x = b on the GPU for a factorization in natural ordering: no
   permutations are applied, just forward-solve L then back-solve U. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: reads b, writes the intermediate result into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: reads tempGPU, writes the solution into x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* a triangular solve pair costs ~2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1690 
/* Mirror the nonzero values of the device (cuSPARSE) matrix back into the host
   CSR array when the GPU holds the only up-to-date copy. Afterwards the host
   and device copies agree, so the offload mask becomes PETSC_OFFLOAD_BOTH.
   A no-op for any other offload state. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0); /* host copy already valid */

  CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  /* only the numerical values move: the sparsity pattern is identical on both sides */
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1711 
/* Hand the caller a host pointer to the matrix's nonzero values, first pulling
   them off the GPU if the device copy is the freshest. The offload mask is set
   to PETSC_OFFLOAD_CPU since the caller may modify the returned array. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); /* ensure host values are current */
  A->offloadmask = PETSC_OFFLOAD_CPU;                   /* CPU copy becomes authoritative */
  *array = aij->a;
  PetscFunctionReturn(0);
}
1723 
/*
   Ensure the device (cusparse) representation of A matches the host data.
   If the nonzero pattern is unchanged and the format is plain CSR, only the
   numerical values are re-uploaded; otherwise the device structures are
   destroyed and rebuilt from the host CSR (optionally in compressed-row
   form, where only rows containing nonzeros are stored on the device).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when only the pattern (no values) is uploaded */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz); /* host->device value upload; the pattern is already on the GPU */
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* any cached device transpose now holds stale values */
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* pattern (or storage format) changed: discard all existing device structures */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) { /* only rows with nonzeros are represented; ridx maps them to full row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values yet: upload structure only, do not mark OFFLOAD_BOTH */
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, required because the handle is put in
           CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz); /* values may be absent (structure-only upload) */

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR copy on the device, convert it to HYB/ELL, then free the CSR */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once the HYB matrix exists */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m); /* scratch for scattering compressed results to full vectors */
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1880 
/* Thrust tuple functor: accumulate element 0 into element 1, i.e. y[i] += x[i]. */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__ void operator()(T t) { thrust::get<1>(t) += thrust::get<0>(t); }
};
1890 
/* Thrust tuple functor: copy element 0 into element 1, i.e. y[i] = x[i]. */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__ void operator()(T t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
1900 
/* Thrust tuple functor: copy element 1 into element 0, i.e. x[i] = y[i]. */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__ void operator()(T t) { thrust::get<0>(t) = thrust::get<1>(t); }
};
1910 
/* Per-product scratch data attached to C->product->data for AIJCUSPARSE matrix products */
struct MatMatCusparse {
  PetscBool             cisdense; /* C was MATSEQDENSE (host) at symbolic time; convert back after the numeric phase */
  PetscScalar           *Bt;      /* device buffer for an explicit B^T (CUDA < 11: csrmm cannot transpose B) */
  Mat                   X;        /* dense intermediate X = A*op(B) used by PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count to log at each numeric phase */
  CsrMatrix             *Bcsr;    /* alternate CSR copy of B, used when B is stored in compressed row form */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor for Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr; /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr; /* dense descriptor for C (or for X in PtAP/RARt) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* current size of mmBuffer */
  void                  *mmBuffer;    /* workspace passed to cusparseSpMM / cusparseSpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* SpGEMM descriptor for sparse-sparse products */
#endif
};
1930 
/*
   Destructor for the MatMatCusparse data hung off C->product->data: releases
   the device buffers, cusparse descriptors, and the dense intermediate X,
   then frees the structure itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  cudaError_t      cerr;
  PetscErrorCode   ierr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a documented no-op */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) { stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matBDescr)   { stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matCDescr)   { stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat); }
  if (mm->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat); }
  if (mm->mmBuffer)    { cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr); }
  if (mm->mmBuffer2)   { cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1955 
1956 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1957 
/*
   Numeric phase for products of a MATSEQAIJCUSPARSE matrix A with a dense
   CUDA matrix B.  AB/AtB/ABt are computed directly into C via cusparse
   SpMM (csrmm before CUDA-11); for PtAP/RARt the dense intermediate
   X = A*op(B) is computed here and then combined with B in a dense-dense
   product to form C.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* choose op(A), the mult struct holding the (possibly transposed) CSR data,
     and the dimensions m x n of op(A)*op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* for PtAP this step computes X = A*B */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) { /* let cusparse apply the transpose */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* build (if needed) and use the explicitly stored transpose */
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* for RARt this step computes X = A*B^T (B is transposed via opB or Bt below) */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write into the intermediate X; other products write into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) { /* lazily create the sparse descriptor for A's CSR data */
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size increased */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* explicitly transpose B into the preallocated buffer mmdata->Bt */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt: combine B with the intermediate X in a dense-dense product */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) { /* the user passed a host (MATSEQDENSE) C: convert back */
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) { /* the user passed a host B: convert back */
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2131 
/*
   Symbolic phase for products of a MATSEQAIJCUSPARSE matrix A with a dense
   matrix B: sizes C, switches it to MATSEQDENSECUDA, and allocates the
   product data (B^T buffer on old CUDA, dense intermediate X for PtAP/RARt)
   that the numeric phase needs.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat_SeqAIJCUSPARSE *cusp;
  MatMatCusparse     *mmdata;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on which operands are (implicitly) transposed */
  if      (product->type == MATPRODUCT_AB)   { m = A->rmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_AtB)  { m = A->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_ABt)  { m = A->rmap->n; n = B->rmap->n; }
  else if (product->type == MATPRODUCT_PtAP) { m = B->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_RARt) { m = B->rmap->n; n = B->rmap->n; }
  else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* allocate and populate the product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* PtAP and RARt go through a dense intermediate X = A*op(B) */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate X: the first MatDenseCUDAGetArray call allocates it on the GPU */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2205 
/*
   Numeric phase for C = op(A)*op(B) with A, B and C all MATSEQAIJCUSPARSE,
   using cusparseSpGEMM (CUDA >= 11) or csrgemm (older CUDA).  The sparsity
   pattern of C is fixed by the symbolic phase; only values are computed here.
   When the symbolic phase already computed the values (reusesym), only the
   assembly bookkeeping at 'finalize' is performed.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* C has no nonzeros: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; /* exploit symmetry to skip the explicit transpose of A */
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; /* exploit symmetry to skip the explicit transpose of B */
  /* select which mult structs (plain or transposed) feed the gemm */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* compute the values into the SpGEMM workspace, then copy them into C's CSR storage */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* fresh values live on the device only */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2320 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of C = A*B, A^t*B or A*B^t
   with both operands of type MATSEQAIJCUSPARSE (CSR storage only).

   Computes the nonzero pattern of C on the GPU (cusparseSpGEMM for CUDA >= 11, the legacy
   csrgemm interface otherwise), mirrors the pattern back into the host SeqAIJ structure of C,
   and stashes the cuSPARSE descriptors and work buffers in the product data (MatMatCusparse)
   so the numeric phase can reuse them. Note the legacy interface cannot do a symbolic-only
   factorization, so pre-11 the values are computed here as well (see comment below).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be MATSEQAIJCUSPARSE in CSR format */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* for a symmetric operand the transposed product reduces to the plain one */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick the sizes of C, the (possibly transposed) GPU mult structs to combine,
     and record whether A (hence C) or B use compressed-row storage */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C inherits the compressed row index set of A */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  /* descriptor and device-resident scalar constants for C */
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: build a valid all-zero CSR for C and skip the GPU SpGEMM entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* wrap B's existing column indices/values with full (non-compressed) row offsets;
       the wrapper CsrMatrix is owned by mmdata, the data arrays are shared with B */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops per (A entry, matching B row entry) pair */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A pairs with row i of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: generic SpGEMM API. Sequence mandated by cuSPARSE:
     workEstimation (size query + run) -> compute (size query + run) -> GetSize -> SetPointers -> copy */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* CUDA < 11: legacy interface; nnz count uses host pointer mode, the gemm itself device mode */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU pattern into the host SeqAIJ structure of C (c->i, c->j, ilen/imax);
     numerical values stay on the GPU, only c->a storage is allocated */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row-related statistics from the expanded row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2645 
2646 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2647 
2648 /* handles sparse or dense B */
/* handles sparse or dense B: select the product symbolic implementation for mat,
   depending on whether B is dense, whether B (and C for ABC) are CUSPARSE matrices
   not bound to the CPU, falling back to the plain SeqAIJ machinery otherwise */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bisdense = PETSC_FALSE,Bgpu = PETSC_FALSE,Cgpu = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* classify the operands: dense B? GPU-resident B? GPU-resident C (only relevant for ABC)? */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bisdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bgpu);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Cgpu = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Cgpu);CHKERRQ(ierr);
    }
  }
  if (Bisdense) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt || t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) { /* A pinned to the host: use the CPU sparse-dense path */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave the operation unset */
  } else if (Bgpu && Cgpu) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt || t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2706 
/* yy = A*xx: thin wrapper over the shared mult-add kernel (no add vector, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,/*yy=*/NULL,/*zz=*/yy,/*trans=*/PETSC_FALSE,/*herm=*/PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2715 
/* zz = A*xx + yy: thin wrapper over the shared mult-add kernel (no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,/*trans=*/PETSC_FALSE,/*herm=*/PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2724 
/* yy = A^H*xx: thin wrapper over the shared mult-add kernel (transpose + conjugation) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,/*yy=*/NULL,/*zz=*/yy,/*trans=*/PETSC_TRUE,/*herm=*/PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2733 
/* zz = A^H*xx + yy: thin wrapper over the shared mult-add kernel (transpose + conjugation) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,/*trans=*/PETSC_TRUE,/*herm=*/PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2742 
/* yy = A^T*xx: thin wrapper over the shared mult-add kernel (transpose, no conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,/*yy=*/NULL,/*zz=*/yy,/*trans=*/PETSC_TRUE,/*herm=*/PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2751 
/* y[idx[i]] += x[i] for 0 <= i < n: scatter-add a compressed work vector into the full vector.
   One thread per entry; launched below as <<<(n+255)/256,256>>> on PetscDefaultCudaStream.
   idx holds compressed-row indices (presumably distinct, so no atomics needed — the caller
   passes cprowIndices). Fixes: compute the global index in PetscInt, since with 64-bit
   indices (PETSC_USE_64BIT_INDICES) n may exceed INT_MAX and the old 'int i' would wrap;
   idx is now const-qualified, matching x, since the kernel only reads it. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2757 
2758 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2759 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2760 {
2761   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2762   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2763   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2764   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2765   PetscErrorCode               ierr;
2766   cudaError_t                  cerr;
2767   cusparseStatus_t             stat;
2768   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2769   PetscBool                    compressed;
2770 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2771   PetscInt                     nx,ny;
2772 #endif
2773 
2774   PetscFunctionBegin;
2775   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
2776   if (!a->nonzerorowcnt) {
2777     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2778     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2779     PetscFunctionReturn(0);
2780   }
2781   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2782   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2783   if (!trans) {
2784     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2785     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2786   } else {
2787     if (herm || !A->form_explicit_transpose) {
2788       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2789       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2790     } else {
2791       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2792       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2793     }
2794   }
2795   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2796   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2797 
2798   try {
2799     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2800     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2801     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2802 
2803     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2804     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2805       /* z = A x + beta y.
2806          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2807          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2808       */
2809       xptr = xarray;
2810       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2811       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2812      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2813       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2814           allocated to accommodate different uses. So we get the length info directly from mat.
2815        */
2816       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2817         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2818         nx = mat->num_cols;
2819         ny = mat->num_rows;
2820       }
2821      #endif
2822     } else {
2823       /* z = A^T x + beta y
2824          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2825          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2826        */
2827       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2828       dptr = zarray;
2829       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2830       if (compressed) { /* Scatter x to work vector */
2831         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2832         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2833                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2834                          VecCUDAEqualsReverse());
2835       }
2836      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2837       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2838         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2839         nx = mat->num_rows;
2840         ny = mat->num_cols;
2841       }
2842      #endif
2843     }
2844 
2845     /* csr_spmv does y = alpha op(A) x + beta y */
2846     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2847      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2848       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2849       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2850         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2851         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2852         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2853                                 matstruct->matDescr,
2854                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2855                                 matstruct->cuSpMV[opA].vecYDescr,
2856                                 cusparse_scalartype,
2857                                 cusparsestruct->spmvAlg,
2858                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2859         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2860 
2861         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2862       } else {
2863         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2864         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2865         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2866       }
2867 
2868       stat = cusparseSpMV(cusparsestruct->handle, opA,
2869                                matstruct->alpha_one,
2870                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2871                                matstruct->cuSpMV[opA].vecXDescr,
2872                                beta,
2873                                matstruct->cuSpMV[opA].vecYDescr,
2874                                cusparse_scalartype,
2875                                cusparsestruct->spmvAlg,
2876                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2877      #else
2878       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2879       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2880                                mat->num_rows, mat->num_cols,
2881                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2882                                mat->values->data().get(), mat->row_offsets->data().get(),
2883                                mat->column_indices->data().get(), xptr, beta,
2884                                dptr);CHKERRCUSPARSE(stat);
2885      #endif
2886     } else {
2887       if (cusparsestruct->nrows) {
2888        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2889         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2890        #else
2891         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2892         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2893                                  matstruct->alpha_one, matstruct->descr, hybMat,
2894                                  xptr, beta,
2895                                  dptr);CHKERRCUSPARSE(stat);
2896        #endif
2897       }
2898     }
2899     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2900     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2901 
2902     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2903       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2904         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2905           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2906         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2907           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2908         }
2909       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2910         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2911       }
2912 
2913       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2914       if (compressed) {
2915         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2916         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2917            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2918            prevent that. So I just add a ScatterAdd kernel.
2919          */
2920        #if 0
2921         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2922         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2923                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2924                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2925                          VecCUDAPlusEquals());
2926        #else
2927         PetscInt n = matstruct->cprowIndices->size();
2928         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2929        #endif
2930         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2931         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2932       }
2933     } else {
2934       if (yy && yy != zz) {
2935         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2936       }
2937     }
2938     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2939     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2940     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2941   } catch(char *ex) {
2942     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2943   }
2944   if (yy) {
2945     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2946   } else {
2947     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2948   }
2949   PetscFunctionReturn(0);
2950 }
2951 
/* zz = A^T * xx + yy, computed on the GPU via the shared mult-add kernel.
   The last two flags select plain transpose (PETSC_TRUE) and non-Hermitian
   (PETSC_FALSE) operation. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2960 
/* Finish assembly of a SeqAIJCUSPARSE matrix. If a split-CSR device matrix
   exists (i.e. values may have been assembled directly on the GPU), mark the
   GPU copy as the authoritative one after a final assembly. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *deviceMat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) deviceMat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* does very little if the matrix was assembled on the GPU */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (deviceMat) A->offloadmask = PETSC_OFFLOAD_GPU; /* device holds the freshest values */
  PetscFunctionReturn(0);
}
2977 
2978 /* --------------------------------------------------------------------------------*/
2979 /*@
2980    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2982    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2983    assembly performance the user should preallocate the matrix storage by setting
2984    the parameter nz (or the array nnz).  By setting these parameters accurately,
2985    performance during matrix assembly can be increased by more than a factor of 50.
2986 
2987    Collective
2988 
2989    Input Parameters:
2990 +  comm - MPI communicator, set to PETSC_COMM_SELF
2991 .  m - number of rows
2992 .  n - number of columns
2993 .  nz - number of nonzeros per row (same for all rows)
2994 -  nnz - array containing the number of nonzeros in the various rows
2995          (possibly different for each row) or NULL
2996 
2997    Output Parameter:
2998 .  A - the matrix
2999 
3000    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3002    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3003 
3004    Notes:
3005    If nnz is given then nz is ignored
3006 
3007    The AIJ format (also called the Yale sparse matrix format or
3008    compressed row storage), is fully compatible with standard Fortran 77
3009    storage.  That is, the stored row and column indices can begin at
3010    either one (as in Fortran) or zero.  See the users' manual for details.
3011 
3012    Specify the preallocated storage with either nz or nnz (not both).
3013    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3014    allocation.  For large problems you MUST preallocate memory or you
3015    will get TERRIBLE performance, see the users' manual chapter on matrices.
3016 
3017    By default, this format uses inodes (identical nodes) when possible, to
3018    improve numerical efficiency of matrix-vector products and solves. We
3019    search for consecutive rows with the same nonzero structure, thereby
3020    reusing matrix information to achieve increased efficiency.
3021 
3022    Level: intermediate
3023 
3024 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3025 @*/
/* Create an empty sequential AIJ matrix of the CUSPARSE type and preallocate
   its storage; see the manual page above for the parameter semantics. */
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation directly; the CSR structure lives on the host
     until the matrix is first used on the GPU */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3037 
/* Destroy a SeqAIJCUSPARSE matrix: free the GPU mult structures (or the
   triangular-factor structures if this is a factored matrix), free the
   split-CSR device matrix created for device-side assembly (if any),
   detach all composed methods, then destroy the host SeqAIJ part. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* capture the device-matrix pointer BEFORE the spptr that records it is destroyed */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    /* the struct living on the device records further device allocations; copy it
       back to the host so those pointers can be recovered and freed */
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i was separately allocated on the device only in the compressed-row case */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* detach composed methods so no dangling pointers remain on the object */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3073 
3074 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3075 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a host SeqAIJ copy first, then converting it in place
   to the CUSPARSE type so the new matrix carries the GPU machinery too. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3085 
/* Y += a*X on the GPU. Three paths are taken depending on the relation of the
   nonzero patterns:
     - SAME_NONZERO_PATTERN:   one cublas axpy over the value arrays,
     - SUBSET_NONZERO_PATTERN: cusparse csrgeam (sparse matrix add) into Y,
     - otherwise:              fall back to the CPU MatAXPY_SeqAIJ.
   When the patterns look different but are in fact identical, the code detects
   that with device-side comparisons and upgrades to the fast path. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed bindings (one matrix bound to CPU): do the work on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nnz count: compare row offsets and column indices on the device */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a and b live on the host, so switch the pointer mode temporarily */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace query + allocation for csrgeam */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* note Y is used both as input and output: its pattern is a superset of X's */
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode expected by the rest of the code */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the update is a dense axpy over the nz value arrays */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* patterns genuinely differ: the result pattern changes, do it on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3186 
/* Y *= a, performed on the GPU: the CSR value array is scaled in place with a
   single cublas scal over all nz entries; the diagonal cache is invalidated. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cudaError_t    cerr;
  cublasHandle_t cublashandle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1,bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublashandle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublashandle,bnz,&a,vals,one);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr); /* one multiply per stored entry */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3210 
/* Zero all stored entries. The GPU copies (the CSR matrix and, if present, its
   cached transpose) are zeroed with thrust::fill, and the host value array is
   zeroed as well, so the offload mask can be set to BOTH when the GPU copy
   existed and to CPU otherwise. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij    = (Mat_SeqAIJ*)A->data;
  PetscBool      zeroGPU = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        zeroGPU = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the host values directly instead of calling MatZeroEntries_SeqAIJ() */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = zeroGPU ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3242 
/* Bind the matrix to the CPU (flg = PETSC_TRUE) or release it back to the GPU
   (flg = PETSC_FALSE). Binding swaps the operation table to the plain SeqAIJ
   implementations and removes the GPU-specific composed methods; unbinding
   restores the CUSPARSE implementations. Factored matrices are left alone. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make the host copy current before routing operations to the CPU */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* drop GPU-only composed methods while bound to the CPU */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inode optimization only applies to the CPU kernels */
  PetscFunctionReturn(0);
}
3293 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE. Depending on reuse, a new matrix
   is created (MAT_INITIAL_MATRIX), an existing one is overwritten
   (MAT_REUSE_MATRIX), or A itself is morphed in place (MAT_INPLACE_MATRIX).
   The GPU-side spptr (handles, storage-format and algorithm choices) is
   allocated on first conversion, and the CUSPARSE operation table is
   installed via MatBindToCPU_SeqAIJCUSPARSE(...,PETSC_FALSE). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX this is A itself */

  /* vectors created from this matrix should default to CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: allocate the mult structure and pick default algorithms */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factors structure instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation table (matrix starts out unbound from the CPU) */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3347 
/* Type constructor for MATSEQAIJCUSPARSE: build the host SeqAIJ representation
   first, then morph it in place into the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3357 
3358 /*MC
3359    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3360 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3362    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3363    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3364 
3365    Options Database Keys:
3366 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3367 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3368 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3369 
3370   Level: beginner
3371 
3372 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3373 M*/
3374 
3375 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3376 
/* Register the CUSPARSE solver packages: the band LU factorization for plain
   SeqAIJ, and the LU/Cholesky/ILU/ICC factorizations for SeqAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3390 
/* Free all GPU-side resources owned by a Mat_SeqAIJCUSPARSE structure (mult
   structures, work/COO buffers, cusparse handle) and the structure itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (cusp) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    /* thrust containers: delete on NULL is a no-op, so no guards are needed */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    delete cusp->csr2csc_i;
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3410 
/* Free a CsrMatrix (its three device arrays plus the struct) and NULL the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = NULL;
  PetscFunctionReturn(0);
}
3423 
/* Free one triangular-factor structure: matrix descriptor, analysis info,
   CSR storage, solve/conversion buffers, and the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  Mat_SeqAIJCUSPARSETriFactorStruct *tf;

  PetscFunctionBegin;
  tf = *trifactor;
  if (tf) {
    if (tf->descr)     {stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);}
    if (tf->solveInfo) {stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);}
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    if (tf->solveBuffer) {cudaError_t cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);}
    if (tf->AA_h)        {cudaError_t cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (tf->csr2cscBuffer) {cudaError_t cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3443 
/* Free a mult structure: the stored matrix (CSR, or HYB on CUDA < 11), the
   matrix descriptor, compressed-row indices, device-resident scalar constants,
   and (CUDA >= 11) the generic SpMV descriptors/buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code; it was previously discarded, hiding failures in CsrMatrix_Destroy() */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are scalars kept in device memory for
       CUSPARSE_POINTER_MODE_DEVICE calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* one cuSpMV slot per operation: N, T, H */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3487 
/* Release everything a triangular-factors structure owns except the cusparse
   handle, leaving the structure in a state where it can be refilled (or Reset
   again) safely. All freed pointers are NULLed so repeated calls are no-ops. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* bug fix: NULL the band pointers after freeing (they previously were not),
       otherwise a second Reset — e.g. via TriFactors_Destroy after an explicit
       Reset — would cudaFree the same device memory twice */
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr); (*trifactors)->a_band_d = NULL;}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr); (*trifactors)->i_band_d = NULL;}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3510 
/* Fully destroy a triangular-factors structure: reset its contents, destroy
   the cusparse handle, and free the structure itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* explicit assignment instead of the old `if (handle = ...)` condition,
       which was warning-prone and easily misread as a comparison */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3527 
/* Strict-weak lexicographic ordering on zipped (row,col) index pairs; used to sort COO entries */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>();
    const PetscInt r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;        /* rows differ: order by row */
    return t1.get<1>() < t2.get<1>();    /* same row: order by column */
  }
};
3538 
/* Equality of two zipped (row,col) index pairs; used to compact duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3548 
/* "Changed" flag for adjacent_difference: 0 when the two indices coincide, 1 otherwise */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3557 
/* Logical OR of two 0/1 flags, returned as a PetscInt 0 or 1 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3566 
3567 #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - fills the CSR values of the matrix from a COO value array v,
  using the sort permutation (cusp->cooPerm) and, when the COO pattern contains repeated entries,
  the destination-index array (cusp->cooPerm_a) built by MatSetPreallocationCOO_SeqAIJCUSPARSE().

  Parameters:
+ A     - the matrix, previously preallocated with MatSetPreallocationCOO_SeqAIJCUSPARSE()
. v     - the values, ordered as the (i,j) pairs that were passed to the preallocation routine;
          may reside in host or device memory, and may be NULL
- imode - INSERT_VALUES to overwrite the current values, ADD_VALUES to accumulate into them

  When v is NULL the values are zeroed for INSERT_VALUES (left untouched for ADD_VALUES) and
  only the assembly bookkeeping at the end is performed.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* temporary device copy of v when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information is available: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: zero the matrix only for INSERT_VALUES, then do the assembly bookkeeping */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size(); /* number of COO entries provided at preallocation time */
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* v lives on the host: stage it to the device before the thrust kernels below */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* repeated entries in the COO pattern: reduce duplicates first, then add */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      /* vbit walks v in sorted-COO order via the permutation */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* sum runs of duplicates (keyed by the destination index in cooPerm_a) into cooPerm_w */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      /* then add the reduced values onto the existing CSR values */
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* unique pattern: permute v into CSR order and += in one fused pass */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique pattern: straight permuted copy into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is now the authoritative one */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3646 
/* Marks the cached explicit transpose as stale; when destroy is PETSC_TRUE the transpose
   multiply structure and the cached csr2csc permutation are freed as well */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3663 
3664 #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR structure of the matrix on the GPU
  from n COO (i,j) pairs, and records the auxiliary arrays needed later by
  MatSetValuesCOO_SeqAIJCUSPARSE():

    cusp->cooPerm   - permutation mapping sorted COO order back to the caller's input order
    cusp->cooPerm_a - for each sorted COO entry, its destination index in the compacted CSR
                      value array; only kept when the COO pattern has repeated (i,j) entries,
                      otherwise it is deleted and left NULL

  Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices (host array of length n)
- coo_j - column indices (host array of length n)
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed since a previous preallocation: discard the cached arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);            /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row entry counts, becomes a->i[1..m] */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) pairs lexicographically, carrying the original positions along so that
       cooPerm[k] is the input position of the k-th entry in sorted order */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;  /* keep copies of the sorted rows... */
    THRUSTINTARRAY w = d_j;  /* ...and sorted columns before unique() compacts them */

    /* compact duplicate (i,j) pairs; nekey marks the end of the unique range */
    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag positions where the sorted (i,j) differs from its predecessor (row change OR
         column change), then inclusive-scan the flags so cooPerm_a[k] becomes the index of
         the unique entry that sorted entry k contributes to */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of unique entries with row index <= r, i.e. the CSR offset of row r+1 */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays (a->i, a->j, a->a) from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* d_j was compacted in place by unique(); its first a->nz entries are the CSR columns */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty matrix: fall back to a zero preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3764 
/* Returns a read-only device pointer to the CSR value array, after making sure the GPU copy
   is up to date; pair with MatSeqAIJCUSPARSERestoreArrayRead() */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* ensure device values are current */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3783 
/* Releases the pointer obtained with MatSeqAIJCUSPARSEGetArrayRead(); read-only access, so
   no offload-mask or object-state update is performed (compare MatSeqAIJCUSPARSERestoreArray()) */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the caller's copy of the pointer */
  PetscFunctionReturn(0);
}
3793 
/* Returns a writable device pointer to the CSR value array, after refreshing the GPU copy;
   marks the device data as authoritative and invalidates the cached transpose values */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* read-write access: current values needed */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3814 
/* Releases the pointer obtained with MatSeqAIJCUSPARSEGetArray(); bumps the object state
   since the caller had write access and may have changed the values */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's copy of the pointer */
  PetscFunctionReturn(0);
}
3827 
/* Returns a write-only device pointer to the CSR value array. Unlike MatSeqAIJCUSPARSEGetArray(),
   no host-to-device copy is triggered: the caller promises to overwrite the values */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3847 
/* Releases the pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite(); bumps the object state
   since the caller had write access and may have changed the values */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's copy of the pointer */
  PetscFunctionReturn(0);
}
3860 
/* Strict-weak lexicographic (row,col) order on 4-tuples (row, col, value, perm flag);
   used to merge the COO streams of two matrices in MatSeqAIJCUSPARSEMergeMats() */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>();
    const int r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;        /* rows differ: order by row */
    return t1.get<1>() < t2.get<1>();    /* same row: order by column */
  }
};
3871 
/* Functor adding a fixed offset to each input value (e.g. to shift column indices of B
   by the number of columns of A when concatenating matrices) */
struct Shift
{
  int _shift; /* the constant offset */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
3883 
/* merges two SeqAIJCUSPARSE matrices side by side, the [A';B']' operation in matlab notation */
3885 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3886 {
3887   PetscErrorCode               ierr;
3888   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3889   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3890   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3891   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3892   PetscInt                     Annz,Bnnz;
3893   cusparseStatus_t             stat;
3894   PetscInt                     i,m,n,zero = 0;
3895   cudaError_t                  cerr;
3896 
3897   PetscFunctionBegin;
3898   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3899   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3900   PetscValidPointer(C,4);
3901   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3902   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3903   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3904   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3905   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3906   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3907   if (reuse == MAT_INITIAL_MATRIX) {
3908     m     = A->rmap->n;
3909     n     = A->cmap->n + B->cmap->n;
3910     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3911     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3912     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3913     c     = (Mat_SeqAIJ*)(*C)->data;
3914     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3915     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3916     Ccsr  = new CsrMatrix;
3917     Cmat->cprowIndices      = NULL;
3918     c->compressedrow.use    = PETSC_FALSE;
3919     c->compressedrow.nrows  = 0;
3920     c->compressedrow.i      = NULL;
3921     c->compressedrow.rindex = NULL;
3922     Ccusp->workVector       = NULL;
3923     Ccusp->nrows    = m;
3924     Ccusp->mat      = Cmat;
3925     Ccusp->mat->mat = Ccsr;
3926     Ccsr->num_rows  = m;
3927     Ccsr->num_cols  = n;
3928     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3929     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3930     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3931     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3932     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3933     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3934     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3935     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3936     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3937     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3938     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3939     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
3940     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3941     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3942     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3943 
3944     Acsr = (CsrMatrix*)Acusp->mat->mat;
3945     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3946     Annz = (PetscInt)Acsr->column_indices->size();
3947     Bnnz = (PetscInt)Bcsr->column_indices->size();
3948     c->nz = Annz + Bnnz;
3949     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3950     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3951     Ccsr->values = new THRUSTARRAY(c->nz);
3952     Ccsr->num_entries = c->nz;
3953     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3954     if (c->nz) {
3955       auto Acoo = new THRUSTINTARRAY32(Annz);
3956       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3957       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3958       THRUSTINTARRAY32 *Aroff,*Broff;
3959 
3960       if (a->compressedrow.use) { /* need full row offset */
3961         if (!Acusp->rowoffsets_gpu) {
3962           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3963           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3964           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3965         }
3966         Aroff = Acusp->rowoffsets_gpu;
3967       } else Aroff = Acsr->row_offsets;
3968       if (b->compressedrow.use) { /* need full row offset */
3969         if (!Bcusp->rowoffsets_gpu) {
3970           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3971           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3972           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3973         }
3974         Broff = Bcusp->rowoffsets_gpu;
3975       } else Broff = Bcsr->row_offsets;
3976       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3977       stat = cusparseXcsr2coo(Acusp->handle,
3978                               Aroff->data().get(),
3979                               Annz,
3980                               m,
3981                               Acoo->data().get(),
3982                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3983       stat = cusparseXcsr2coo(Bcusp->handle,
3984                               Broff->data().get(),
3985                               Bnnz,
3986                               m,
3987                               Bcoo->data().get(),
3988                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3989       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3990       auto Aperm = thrust::make_constant_iterator(1);
3991       auto Bperm = thrust::make_constant_iterator(0);
3992 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3993       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3994       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
3995 #else
3996       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
3997       auto Bcib = Bcsr->column_indices->begin();
3998       auto Bcie = Bcsr->column_indices->end();
3999       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4000 #endif
4001       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4002       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4003       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4004       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4005       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4006       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4007       auto p1 = Ccusp->cooPerm->begin();
4008       auto p2 = Ccusp->cooPerm->begin();
4009       thrust::advance(p2,Annz);
4010       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4011 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4012       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4013 #endif
4014       auto cci = thrust::make_counting_iterator(zero);
4015       auto cce = thrust::make_counting_iterator(c->nz);
4016 #if 0 //Errors on SUMMIT cuda 11.1.0
4017       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4018 #else
4019       auto pred = thrust::identity<int>();
4020       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4021       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4022 #endif
4023       stat = cusparseXcoo2csr(Ccusp->handle,
4024                               Ccoo->data().get(),
4025                               c->nz,
4026                               m,
4027                               Ccsr->row_offsets->data().get(),
4028                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4029       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4030       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4031       delete wPerm;
4032       delete Acoo;
4033       delete Bcoo;
4034       delete Ccoo;
4035 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4036       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4037                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4038                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4039                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4040 #endif
4041       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4042         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4043         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4044         CsrMatrix *CcsrT = new CsrMatrix;
4045         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4046         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4047 
4048         (*C)->form_explicit_transpose = PETSC_TRUE;
4049         (*C)->transupdated = PETSC_TRUE;
4050         Ccusp->rowoffsets_gpu = NULL;
4051         CmatT->cprowIndices = NULL;
4052         CmatT->mat = CcsrT;
4053         CcsrT->num_rows = n;
4054         CcsrT->num_cols = m;
4055         CcsrT->num_entries = c->nz;
4056 
4057         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4058         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4059         CcsrT->values = new THRUSTARRAY(c->nz);
4060 
4061         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4062         auto rT = CcsrT->row_offsets->begin();
4063         if (AT) {
4064           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4065           thrust::advance(rT,-1);
4066         }
4067         if (BT) {
4068           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4069           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4070           thrust::copy(titb,tite,rT);
4071         }
4072         auto cT = CcsrT->column_indices->begin();
4073         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4074         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4075         auto vT = CcsrT->values->begin();
4076         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4077         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4078         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4079         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4080 
4081         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4082         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4083         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4084         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4085         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4086         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4087         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4088         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4089         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4090 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4091         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4092                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4093                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4094                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4095 #endif
4096         Ccusp->matTranspose = CmatT;
4097       }
4098     }
4099 
4100     c->singlemalloc = PETSC_FALSE;
4101     c->free_a       = PETSC_TRUE;
4102     c->free_ij      = PETSC_TRUE;
4103     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4104     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4105     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4106       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4107       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4108       ii   = *Ccsr->row_offsets;
4109       jj   = *Ccsr->column_indices;
4110       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4111       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4112     } else {
4113       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4114       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4115     }
4116     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4117     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4118     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4119     c->maxnz = c->nz;
4120     c->nonzerorowcnt = 0;
4121     c->rmax = 0;
4122     for (i = 0; i < m; i++) {
4123       const PetscInt nn = c->i[i+1] - c->i[i];
4124       c->ilen[i] = c->imax[i] = nn;
4125       c->nonzerorowcnt += (PetscInt)!!nn;
4126       c->rmax = PetscMax(c->rmax,nn);
4127     }
4128     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4129     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4130     (*C)->nonzerostate++;
4131     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4132     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4133     Ccusp->nonzerostate = (*C)->nonzerostate;
4134     (*C)->preallocated  = PETSC_TRUE;
4135   } else {
4136     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4137     c = (Mat_SeqAIJ*)(*C)->data;
4138     if (c->nz) {
4139       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4140       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4141       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4142       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4143       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4144       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4145       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4146       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4147       Acsr = (CsrMatrix*)Acusp->mat->mat;
4148       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4149       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4150       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4151       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4152       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4153       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4154       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4155       auto pmid = Ccusp->cooPerm->begin();
4156       thrust::advance(pmid,Acsr->num_entries);
4157       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4158       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4159                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4160       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4161                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4162       thrust::for_each(zibait,zieait,VecCUDAEquals());
4163       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4164                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4165       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4166                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4167       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4168       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4169       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4170         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4171         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4172         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4173         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4174         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4175         auto vT = CcsrT->values->begin();
4176         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4177         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4178         (*C)->transupdated = PETSC_TRUE;
4179       }
4180       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4181       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4182     }
4183   }
4184   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4185   (*C)->assembled     = PETSC_TRUE;
4186   (*C)->was_assembled = PETSC_FALSE;
4187   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4188   PetscFunctionReturn(0);
4189 }
4190 
/*
  Gathers selected entries of the GPU-resident AIJ value array of A into v.

  Input parameters:
    A   - the SEQAIJCUSPARSE matrix whose (GPU) value array is read
    n   - number of entries to copy
    idx - indices into A's value array to gather, or NULL to copy the first n
          values contiguously
  Output parameter:
    v   - destination buffer; may be either host or device memory (detected
          with isCudaMem(), and the copy direction is chosen accordingly)
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* true when v is device memory */
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* Upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* If v is host memory, gather into a temporary device buffer first */
    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[idx[k]] for k = 0..n-1, performed on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) { /* host destination: bring the gathered values down */
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* No index set: contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* A host destination implies a device-to-host transfer of n scalars above;
     log it in the GPU->CPU direction (was incorrectly PetscLogCpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4230