xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision b4f26c06fff0db25fc6fd5b778f99b5b9671a638)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
86 
87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
89 
90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/* Attach a user-provided CUDA stream to the matrix's cuSPARSE handle so all
   subsequent cuSPARSE calls for this matrix are issued on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  /* remember the stream and push it into the library handle */
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
103 
/* Adopt a user-provided cuSPARSE handle for this matrix, destroying any handle
   the matrix previously owned, and (re)set device pointer mode on it. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle (if any) before adopting the new one */
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    cusp->handle = handle;
  }
  /* scalar arguments (alpha/beta) are expected to live in device memory */
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
120 
/* Forget (without destroying) the cuSPARSE handle stored in the matrix; used when
   the handle is owned elsewhere and must not be freed with this matrix. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqaijcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  /* only drop the handle when the matrix really is SEQAIJCUSPARSE and has a GPU struct */
  if (isseqaijcusparse && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Query function composed on factor matrices: reports MATSOLVERCUSPARSE as the solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
   Factory routine registered with the MATSOLVERCUSPARSE package: creates the factor
   matrix B (of type SEQAIJCUSPARSE) for A, and wires up the symbolic-factorization
   function pointers appropriate for the requested factor type.

   Input Parameters:
+  A     - the matrix to be factored (assumed square; sizes of B are taken from A->rmap->n)
-  ftype - the kind of factorization (LU/ILU/ILUDT or Cholesky/ICC)

   Output Parameter:
.  B     - the new (unallocated) factor matrix
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* if A is pinned to the CPU and that binding propagates, keep the factor on the CPU too */
  if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    /* select GPU or plain SeqAIJ symbolic routines depending on the CPU binding of A */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* storage is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
195 
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices.
   A sequential matrix has a single storage format, so MULT and ALL act identically. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations set the one-and-only format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
213 
214 /*@
215    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
216    operation. Only the MatMult operation can use different GPU storage formats
217    for MPIAIJCUSPARSE matrices.
218    Not Collective
219 
220    Input Parameters:
221 +  A - Matrix of type SEQAIJCUSPARSE
222 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
223 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
224 
225    Output Parameter:
226 
227    Level: intermediate
228 
229 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
230 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation (no-op if the type does not provide one) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
240 
/* MatSetOption implementation: intercepts MAT_FORM_EXPLICIT_TRANSPOSE so the cached
   GPU transpose can be invalidated; all other options go to the SeqAIJ handler. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
258 
259 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
260 
/* Numeric LU factorization: performed on the CPU (via SeqAIJ), then the triangular
   factors are analyzed and copied to the GPU for subsequent MatSolve calls. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      row_identity,col_identity,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before the CPU factorization */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* natural (identity) row and column orderings admit a cheaper solve kernel */
  ierr = ISIdentity(b->row,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(b->col,&col_identity);CHKERRQ(ierr);
  natural = (PetscBool)(row_identity && col_identity);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build and upload the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
291 
/* Process runtime options (-mat_cusparse_*) controlling the GPU storage format and,
   for CUDA >= 11, the cuSPARSE algorithm choices for SpMV, SpMM, and CSR->CSC conversion.
   Only applies to non-factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for MatMult only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* storage format used for all operations (SpMV and triangular solve) */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* CUDA 11.4 renamed the enumerators (CUSPARSE_SPMV_*); verify the 0-based positions still line up */
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
330 
/* Symbolic ILU: done on the CPU by SeqAIJ; any previously built GPU triangular
   factors are discarded first, and the numeric phase is redirected to the GPU version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
342 
/* Symbolic LU: done on the CPU by SeqAIJ; stale GPU factors are reset and the
   numeric phase is redirected to the GPU version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
354 
/* Symbolic ICC: done on the CPU by SeqAIJ; stale GPU factors are reset and the
   numeric Cholesky phase is redirected to the GPU version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
366 
/* Symbolic Cholesky: done on the CPU by SeqAIJ; stale GPU factors are reset and the
   numeric Cholesky phase is redirected to the GPU version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
378 
/*
   Builds (or refreshes) the GPU copy of the unit lower-triangular factor L of an
   ILU factorization stored in A (a factored SeqAIJ matrix).

   On first call (no loTriFactor yet) the full CSR structure of L -- including the
   implicit unit diagonal, which SeqAIJ does not store -- is assembled in pinned host
   buffers, uploaded to the device, and the cuSPARSE triangular-solve analysis is run.
   On later calls only the numerical values are refilled and re-uploaded; the sparsity
   pattern and analysis info are reused.

   NOTE(review): assumes A holds a factored matrix where row i of the strictly-lower
   part occupies a->i[i]..a->i[i+1]; the diagonal entries of L are all 1 and are
   appended per row here -- confirm against the SeqAIJ ILU storage convention.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the host copy is newer than (or the only copy of) the data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers for fast H2D transfer */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: device-side CSR arrays populated from the pinned host buffers */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE requires an explicit user-allocated work buffer for csrsv2 */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AA_h is kept alive for cheap value-only updates later */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix: same layout as the build path, values only */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
525 
526 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
527 {
528   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
529   PetscInt                          n = A->rmap->n;
530   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
531   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
532   cusparseStatus_t                  stat;
533   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
534   const MatScalar                   *aa = a->a,*v;
535   PetscInt                          *AiUp, *AjUp;
536   PetscInt                          i,nz, nzUpper, offset;
537   PetscErrorCode                    ierr;
538   cudaError_t                       cerr;
539 
540   PetscFunctionBegin;
541   if (!n) PetscFunctionReturn(0);
542   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
543     try {
544       /* next, figure out the number of nonzeros in the upper triangular matrix. */
545       nzUpper = adiag[0]-adiag[n];
546       if (!upTriFactor) {
547         PetscScalar *AAUp;
548 
549         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
550 
551         /* Allocate Space for the upper triangular matrix */
552         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
553         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
554 
555         /* Fill the upper triangular matrix */
556         AiUp[0]=(PetscInt) 0;
557         AiUp[n]=nzUpper;
558         offset = nzUpper;
559         for (i=n-1; i>=0; i--) {
560           v  = aa + adiag[i+1] + 1;
561           vi = aj + adiag[i+1] + 1;
562 
563           /* number of elements NOT on the diagonal */
564           nz = adiag[i] - adiag[i+1]-1;
565 
566           /* decrement the offset */
567           offset -= (nz+1);
568 
569           /* first, set the diagonal elements */
570           AjUp[offset] = (PetscInt) i;
571           AAUp[offset] = (MatScalar)1./v[nz];
572           AiUp[i]      = AiUp[i+1] - (nz+1);
573 
574           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
575           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
576         }
577 
578         /* allocate space for the triangular factor information */
579         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
580         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
581 
582         /* Create the matrix description */
583         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
584         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
585        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
586         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
587        #else
588         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
589        #endif
590         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
591         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
592 
593         /* set the operation */
594         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
595 
596         /* set the matrix */
597         upTriFactor->csrMat = new CsrMatrix;
598         upTriFactor->csrMat->num_rows = n;
599         upTriFactor->csrMat->num_cols = n;
600         upTriFactor->csrMat->num_entries = nzUpper;
601 
602         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
603         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
604 
605         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
606         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
607 
608         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
609         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
610 
611         /* Create the solve analysis information */
612         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
613         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
614       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
615         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
616                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
617                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
618                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
619                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
620         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
621       #endif
622 
623         /* perform the solve analysis */
624         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
625                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
626                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
627                                  upTriFactor->csrMat->column_indices->data().get(),
628                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
629                                  upTriFactor->solveInfo,
630                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
631                                #else
632                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
633                                #endif
634         cerr = WaitForCUDA();CHKERRCUDA(cerr);
635         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
636 
637         /* assign the pointer */
638         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
639         upTriFactor->AA_h = AAUp;
640         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
641         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
642         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
643       } else {
644         if (!upTriFactor->AA_h) {
645           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
646         }
647         /* Fill the upper triangular matrix */
648         offset = nzUpper;
649         for (i=n-1; i>=0; i--) {
650           v  = aa + adiag[i+1] + 1;
651 
652           /* number of elements NOT on the diagonal */
653           nz = adiag[i] - adiag[i+1]-1;
654 
655           /* decrement the offset */
656           offset -= (nz+1);
657 
658           /* first, set the diagonal elements */
659           upTriFactor->AA_h[offset] = 1./v[nz];
660           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
661         }
662         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
663         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
664       }
665     } catch(char *ex) {
666       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
667     }
668   }
669   PetscFunctionReturn(0);
670 }
671 
/*
  Pushes both ILU triangular factors of A to the GPU and caches the row/column
  permutations (when nontrivial) as device-side index arrays so MatSolve can
  apply them with thrust gather/scatter.  Idempotent with respect to the
  permutation copies: each is uploaded only once per factor matrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS = aij->row, colIS = aij->icol;
  PetscBool                    rowIdentity, colIdentity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build (or refresh) the lower and upper triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the two triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation (applied before the lower solve): upload once if nontrivial */
  ierr = ISIdentity(rowIS,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(rowIS,&r);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(r, r+m);
    ierr = ISRestoreIndices(rowIS,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation (applied after the upper solve): upload once if nontrivial */
  ierr = ISIdentity(colIS,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(colIS,&c);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(c, c+m);
    ierr = ISRestoreIndices(colIS,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
715 
/*
  Builds the GPU triangular factors for an ICC factorization of A.

  The host factor is stored in upper-triangular SBAIJ format (note the cast of
  A->data to Mat_SeqSBAIJ below); only the structure of U is uploaded.  The
  "lower" factor reuses that same upper-triangular sparsity pattern: its values
  are the (scaled) U values and it is solved with CUSPARSE_OPERATION_TRANSPOSE,
  so no explicit transpose structure is stored.

  On the first call (no factors allocated yet) both the sparsity structure and
  the numerical values are built and the cuSPARSE solve analysis is performed;
  on subsequent calls (same sparsity, new values) only the values are refreshed
  on the GPU.  No-op unless the data lives (at least) on the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;    /* host (pinned) CSR row offsets / column indices of U; only used when building the structure */
  PetscScalar                       *AAUp;           /* host (pinned) staging buffer for the values of U */
  PetscScalar                       *AALo;           /* host (pinned) staging buffer for the values of the (transposed-solve) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;   /* the ICC factor is kept in SBAIJ (upper triangular) storage */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host buffers so the thrust assigns below can copy to the device efficiently */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first factorization: build structure, values, and solve analysis */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements; the diagonal is stored last in each SBAIJ row (v[nz]) and is inverted here */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negated for U, and additionally scaled by the inverse diagonal for the lower factor */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* UNIT diagonal: the (inverted) diagonal entries stored in the values are not read by the solve */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* intentionally UPPER: the lower solve is performed as the TRANSPOSE of this upper-stored matrix (see solveOp below) */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: shares the structure of U (same row offsets and column indices), only the values differ */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* subsequent numeric factorization: same sparsity pattern, just refresh the values on the GPU */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
935 
/*
  Pushes the ICC triangular factors of A to the GPU and caches the (inverse)
  row permutation as device-side index arrays when it is nontrivial.

  Called from MatCholeskyFactorNumeric_SeqAIJCUSPARSE on every numeric
  factorization; the permutation upload is guarded so it happens only once
  (previously the THRUSTINTARRAYs were re-allocated — and leaked — on every
  repeated numeric factorization; cf. the same guard in the ILU variant).
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the two triangular solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nonzeros of both factors: off-diagonals of U counted twice plus the shared diagonal */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload the permutation and its inverse once (rperm/cperm are
     always allocated together here, so one guard suffices) */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
973 
/*
  Numeric Cholesky factorization for SEQAIJCUSPARSE: the factorization itself
  runs on the host (after pulling A down from the GPU), then the resulting
  factors are analyzed and copied back up for GPU triangular solves.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  IS             rowperm = bseq->row;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor on the CPU using the host copy of A */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the NaturalOrdering
     variants skip the permutation step */
  ierr = ISIdentity(rowperm,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;   /* no GPU MatMatSolve available */
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU and run the solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1003 
1004 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1005 {
1006   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1007   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1008   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1009   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1010   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1011   cusparseStatus_t                  stat;
1012   cusparseIndexBase_t               indexBase;
1013   cusparseMatrixType_t              matrixType;
1014   cusparseFillMode_t                fillMode;
1015   cusparseDiagType_t                diagType;
1016   cudaError_t                       cerr;
1017   PetscErrorCode                    ierr;
1018 
1019   PetscFunctionBegin;
1020   /* allocate space for the transpose of the lower triangular factor */
1021   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1022   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1023 
1024   /* set the matrix descriptors of the lower triangular factor */
1025   matrixType = cusparseGetMatType(loTriFactor->descr);
1026   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1027   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1028     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1029   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1030 
1031   /* Create the matrix description */
1032   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1033   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1034   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1035   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1036   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1037 
1038   /* set the operation */
1039   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1040 
1041   /* allocate GPU space for the CSC of the lower triangular factor*/
1042   loTriFactorT->csrMat = new CsrMatrix;
1043   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1044   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1045   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1046   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1047   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1048   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1049 
1050   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1051 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1052   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1053                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1054                                        loTriFactor->csrMat->values->data().get(),
1055                                        loTriFactor->csrMat->row_offsets->data().get(),
1056                                        loTriFactor->csrMat->column_indices->data().get(),
1057                                        loTriFactorT->csrMat->values->data().get(),
1058                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1059                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1060                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1061   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1062 #endif
1063 
1064   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1065   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1066                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1067                           loTriFactor->csrMat->values->data().get(),
1068                           loTriFactor->csrMat->row_offsets->data().get(),
1069                           loTriFactor->csrMat->column_indices->data().get(),
1070                           loTriFactorT->csrMat->values->data().get(),
1071                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1072                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1073                           CUSPARSE_ACTION_NUMERIC, indexBase,
1074                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1075                         #else
1076                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1077                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1078                         #endif
1079   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1080   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1081 
1082   /* Create the solve analysis information */
1083   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1084   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1085 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1086   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1087                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1088                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1089                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1090                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1091   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1092 #endif
1093 
1094   /* perform the solve analysis */
1095   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1096                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1097                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1098                            loTriFactorT->csrMat->column_indices->data().get(),
1099                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1100                            loTriFactorT->solveInfo,
1101                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1102                           #else
1103                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1104                           #endif
1105   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1106   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1107 
1108   /* assign the pointer */
1109   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1110 
1111   /*********************************************/
1112   /* Now the Transpose of the Upper Tri Factor */
1113   /*********************************************/
1114 
1115   /* allocate space for the transpose of the upper triangular factor */
1116   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1117   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1118 
1119   /* set the matrix descriptors of the upper triangular factor */
1120   matrixType = cusparseGetMatType(upTriFactor->descr);
1121   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1122   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1123     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1124   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1125 
1126   /* Create the matrix description */
1127   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1128   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1129   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1130   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1131   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1132 
1133   /* set the operation */
1134   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1135 
1136   /* allocate GPU space for the CSC of the upper triangular factor*/
1137   upTriFactorT->csrMat = new CsrMatrix;
1138   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1139   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1140   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1141   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1142   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1143   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1144 
1145   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1146 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1147   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1148                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1149                                 upTriFactor->csrMat->values->data().get(),
1150                                 upTriFactor->csrMat->row_offsets->data().get(),
1151                                 upTriFactor->csrMat->column_indices->data().get(),
1152                                 upTriFactorT->csrMat->values->data().get(),
1153                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1154                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1155                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1156   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1157 #endif
1158 
1159   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1160   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1161                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1162                           upTriFactor->csrMat->values->data().get(),
1163                           upTriFactor->csrMat->row_offsets->data().get(),
1164                           upTriFactor->csrMat->column_indices->data().get(),
1165                           upTriFactorT->csrMat->values->data().get(),
1166                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1167                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1168                           CUSPARSE_ACTION_NUMERIC, indexBase,
1169                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1170                         #else
1171                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1172                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1173                         #endif
1174 
1175   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1176   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1177 
1178   /* Create the solve analysis information */
1179   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1180   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1181   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1182   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1183                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1184                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1185                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1186                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1187   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1188   #endif
1189 
1190   /* perform the solve analysis */
1191   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1192                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1193                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1194                            upTriFactorT->csrMat->column_indices->data().get(),
1195                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1196                            upTriFactorT->solveInfo,
1197                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1198                           #else
1199                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1200                           #endif
1201 
1202   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1203   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1204 
1205   /* assign the pointer */
1206   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1207   PetscFunctionReturn(0);
1208 }
1209 
/* Unary functor that maps a PetscScalar to a PetscInt by taking its real part
   (real part only so it also compiles for complex scalars). Used with
   thrust::transform to turn scalar-valued permutation data into integer indices. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1218 
/*
   Forms (or refreshes) an explicit GPU copy of A^T, cached in cusparsestruct->matTranspose.

   Input Parameter:
.  A - the MATSEQAIJCUSPARSE matrix

   Notes:
   If A->transupdated is already set the cached transpose is reused and the routine returns
   immediately. For CSR storage the value permutation csr2csc_i (entry i of A^T <- entry
   csr2csc_i[i] of A) is computed once with csr2csc and cached; later calls only perform a
   permuted copy of the values, not a full conversion. The ELL/HYB path only exists for
   CUDA < 11 and errors out otherwise.
 */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* make sure the device copy of A is current before reading its arrays */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still valid */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats cannot reuse the cached csr2csc permutation; rebuild from scratch */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalars used later in SpMV calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* A^T is (A->cmap->n) x (A->rmap->n) with the same number of nonzeros as A */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload A's row offsets (a->i) so csr2csc below can read them on the device */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* CUDA < 11 only: transpose via HYB -> CSR -> CSC -> HYB using two CSR temporaries */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Compute the value permutation once: csr2csc is run on the sequence 0,1,...,nnz-1 in
         place of the values, so matrixT->values afterwards holds (as scalars) the source index
         in A of each entry of A^T; it is converted to integers into csr2csc_i below. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* csr2cscEx2 needs an explicitly sized external work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather A's values through the cached permutation into the transposed layout */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1448 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Solves A^T x = b using the cached transposed triangular factors, applying the
   row permutation to b first and the column permutation to x last. The transposed
   factors are analyzed lazily on first use.

   Input Parameters:
+  A  - the factored MATSEQAIJCUSPARSE matrix
-  bb - the right-hand side vector

   Output Parameter:
.  xx - the solution vector
 */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into x (used as scratch here) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor (x -> tempGPU); for A^T the factor order is reversed w.r.t. MatSolve */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1537 
/*
   Solves A^T x = b for a factorization done in natural ordering: identical to
   MatSolveTranspose_SeqAIJCUSPARSE but without the row/column permutation steps,
   so b is fed directly to the transposed triangular solves.

   Input Parameters:
+  A  - the factored MATSEQAIJCUSPARSE matrix (natural ordering)
-  bb - the right-hand side vector

   Output Parameter:
.  xx - the solution vector
 */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor (b -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1607 
/*
   Solves A x = b with the cached lower/upper triangular factors:
   apply the row permutation to b, solve L then U, then apply the
   column permutation to obtain x.

   Input Parameters:
+  A  - the factored MATSEQAIJCUSPARSE matrix
-  bb - the right-hand side vector

   Output Parameter:
.  xx - the solution vector
 */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U (x -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1683 
/* Triangular solve x = U^{-1} L^{-1} b on the GPU for a factored matrix in
   natural ordering: unlike the permuted variant, no row/column reordering is
   applied to b or x.  The lower solve writes into the factor's work vector,
   the upper solve writes the solution directly into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; /* scratch for the intermediate y in L y = b */
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: reads barray, writes the intermediate result into tempGPU.
     CUDA >= 9 takes the nnz count, a solve policy and an external buffer; the
     older interface does not. */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: reads tempGPU, writes the final solution into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* 2*nnz - n flops for the forward plus backward sweep, same count as the
     other MatSolve variants in this file */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1745 
/* Mirror the GPU CSR values back into the host Mat_SeqAIJ value array.
   Only the numerical values travel: the host sparsity pattern is assumed to
   match the device CsrMatrix (they were built from each other).  On return
   the matrix is valid on both sides (PETSC_OFFLOAD_BOTH).  A no-op unless
   the device copy is strictly ahead of the host one. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a,csr->values->data().get(),aij->nz*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  }
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1766 
/* Hand out the host pointer to the matrix values, syncing them down from the
   GPU first if needed.  Because the caller receives a writable pointer, the
   device copy must afterwards be considered stale, so the offload mask is
   downgraded to PETSC_OFFLOAD_CPU. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = aij->a;
  PetscFunctionReturn(0);
}
1778 
/* Copy the CSR data of a SeqAIJ matrix from the host to the GPU.  If the
   nonzero pattern is unchanged since the last copy (and CSR storage is in
   use), only the numerical values are transferred; otherwise all device-side
   structures are destroyed and rebuilt from the host arrays. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* cleared when host values are absent: result is then GPU-only, not BOTH */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so cached transpose values are stale (its pattern is kept: PETSC_FALSE) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* pattern (or storage format) changed: tear down and rebuild every device structure */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* only rows with nonzeros are stored; rindex maps them back to global row numbers */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* a->a may legitimately be NULL (structure-only matrix); then values cannot mirror the host */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, consistent with CUSPARSE_POINTER_MODE_DEVICE set below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* first assemble a temporary CSR copy on the device, then convert it to HYB/ELL */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed after the conversion */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* row offsets + column indices (int), optional cprowIndices (PetscInt), 3 scalar constants + values */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1935 
/* Thrust functor for zipped (src,dst) tuples: accumulate dst += src. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) += thrust::get<0>(zipped);
  }
};
1945 
/* Thrust functor for zipped (src,dst) tuples: copy src into dst. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
1955 
/* Thrust functor for zipped tuples, copying in the opposite direction of
   VecCUDAEquals: the second component is written into the first. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
1965 
/* Scratch data attached to a Mat_Product for CUSPARSE-based mat-mat products;
   freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense;  /* user's C was MATSEQDENSE (CPU): convert back after computing on the GPU */
  PetscScalar           *Bt;       /* explicit B^T buffer, allocated pre CUDA-11 where csrmm cannot transpose B */
  Mat                   X;         /* dense intermediate A*op(B) for the RARt/PtAP products */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;     /* CSR copy of B when B is stored in compressed row format */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor paired with Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* current allocation size of mmBuffer */
  void                  *mmBuffer;
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1990 
/* Release every GPU buffer and cuSPARSE descriptor held by a MatMatCusparse
   product-data instance, then free the structure itself.  Installed as the
   Mat_Product destroy callback, hence the opaque void* argument. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so no guard is needed */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and buffers are destroyed only if they were ever created */
  if (mm->matSpBDescr) { stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matBDescr)   { stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matCDescr)   { stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat); }
  if (mm->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat); }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)  { cerr = cudaFree(mm->dBuffer4);CHKERRCUDA(cerr); }
  if (mm->dBuffer5)  { cerr = cudaFree(mm->dBuffer5);CHKERRCUDA(cerr); }
 #endif
  if (mm->mmBuffer)  { cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr); }
  if (mm->mmBuffer2) { cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2019 
2020 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2021 
/* Numeric phase of products C = op(A) op(B) with A sparse (SeqAIJCUSPARSE)
   and B dense.  The sparse-dense product runs through cuSPARSE; for RARt and
   PtAP the intermediate A*op(B) is written into mmdata->X and the trailing
   dense-dense multiplication is delegated to
   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the cuSPARSE operation on A and the result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* use the cached explicit transpose of A when requested, otherwise let cuSPARSE transpose on the fly */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* RARt/PtAP write the sparse-dense result into the intermediate X, everything else writes C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor, so rebuild it */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* reuse the existing buffer when it is already large enough */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in the preallocated Bt buffer via a cuBLAS out-of-place transpose */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* for RARt/PtAP, finish with the dense product of B (or B^T) against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the temporary GPU conversions performed above */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2194 
/* Symbolic phase for products of a sparse SeqAIJCUSPARSE matrix A with a
   dense matrix B.  Sets the dimensions and (CUDA dense) type of C, allocates
   the MatMatCusparse product data, and installs the numeric callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions depend on the product type */
  if      (product->type == MATPRODUCT_AB)   { m = A->rmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_AtB)  { m = A->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_ABt)  { m = A->rmap->n; n = B->rmap->n; }
  else if (product->type == MATPRODUCT_PtAP) { m = B->cmap->n; n = B->cmap->n; }
  else if (product->type == MATPRODUCT_RARt) { m = B->rmap->n; n = B->rmap->n; }
  else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2268 
2269 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2270 {
2271   Mat_Product                  *product = C->product;
2272   Mat                          A,B;
2273   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2274   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2275   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2276   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2277   PetscBool                    flg;
2278   PetscErrorCode               ierr;
2279   cusparseStatus_t             stat;
2280   cudaError_t                  cerr;
2281   MatProductType               ptype;
2282   MatMatCusparse               *mmdata;
2283 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2284   cusparseSpMatDescr_t         BmatSpDescr;
2285 #endif
2286   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2287 
2288   PetscFunctionBegin;
2289   MatCheckProduct(C,1);
2290   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2291   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2292   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2293   mmdata = (MatMatCusparse*)C->product->data;
2294   A = product->A;
2295   B = product->B;
2296   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2297     mmdata->reusesym = PETSC_FALSE;
2298     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2299     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2300     Cmat = Ccusp->mat;
2301     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2302     Ccsr = (CsrMatrix*)Cmat->mat;
2303     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2304     goto finalize;
2305   }
2306   if (!c->nz) goto finalize;
2307   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2308   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2309   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2310   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2311   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2312   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2313   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2314   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2315   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2316   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2317   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2318   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2319   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2320   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2321 
2322   ptype = product->type;
2323   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2324     ptype = MATPRODUCT_AB;
2325     if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2326   }
2327   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2328     ptype = MATPRODUCT_AB;
2329     if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2330   }
2331   switch (ptype) {
2332   case MATPRODUCT_AB:
2333     Amat = Acusp->mat;
2334     Bmat = Bcusp->mat;
2335     break;
2336   case MATPRODUCT_AtB:
2337     Amat = Acusp->matTranspose;
2338     Bmat = Bcusp->mat;
2339     break;
2340   case MATPRODUCT_ABt:
2341     Amat = Acusp->mat;
2342     Bmat = Bcusp->matTranspose;
2343     break;
2344   default:
2345     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2346   }
2347   Cmat = Ccusp->mat;
2348   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2349   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2350   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2351   Acsr = (CsrMatrix*)Amat->mat;
2352   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2353   Ccsr = (CsrMatrix*)Cmat->mat;
2354   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2355   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2356   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2357   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2358 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2359   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2360   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2361   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2362     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2363                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2364                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2365                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2366   #else
2367     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2368                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2369                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2370                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2371     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2372                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2373                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2374   #endif
2375 #else
2376   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2377                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2378                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2379                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2380                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2381 #endif
2382   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2383   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2384   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2385   C->offloadmask = PETSC_OFFLOAD_GPU;
2386 finalize:
2387   /* shorter version of MatAssemblyEnd_SeqAIJ */
2388   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2389   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2390   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2391   c->reallocs         = 0;
2392   C->info.mallocs    += 0;
2393   C->info.nz_unneeded = 0;
2394   C->assembled = C->was_assembled = PETSC_TRUE;
2395   C->num_ass++;
2396   PetscFunctionReturn(0);
2397 }
2398 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse
   matrix-matrix product C = A*B, A^t*B, or A*B^t for SEQAIJCUSPARSE matrices.

   Computes the sparsity pattern of C on the GPU using the cuSPARSE spgemm API
   available for the CUDA version being compiled against:
     - CUDA >= 11.4 : cusparseSpGEMMreuse_{workEstimation,nnz,copy,compute}
     - CUDA 11.0-3  : cusparseSpGEMM_{workEstimation,compute,copy}
     - CUDA < 11.0  : cusparseXcsrgemmNnz followed by a full csrgemm (the legacy
                      interface cannot do a symbolic-only pass; see comment below)
   It then allocates device (CsrMatrix) and host (Mat_SeqAIJ i/j/a) storage for C,
   and stores intermediate cuSPARSE buffers and descriptors in the MatMatCusparse
   product data so the numeric phase can reuse them.

   Collective on C; sets C->ops->productnumeric on success.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed; /* whether B (resp. C) uses compressed row storage */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: holds spgemm descriptors/buffers shared with the numeric phase */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* If A (resp. B) is symmetric, A^t*B (resp. A*B^t) reduces to a plain A*B; record
     that the symbolic phase used this fact so the numeric phase can verify it takes
     the same shortcut */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* Select the operand mult structs and the sizes m x n (inner dim k) of C.
     Since opA/opB are always NON_TRANSPOSE, transposed operands are realized by
     forming an explicit device transpose */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row storage, then c will be in compressed row format too */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  /* device CSR for C: the row count is the compressed-row count when C is compressed */
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device copies of the scalars 1 and 0, used as alpha/beta by the spgemm calls
     below (the handle is put in CUSPARSE_POINTER_MODE_DEVICE before they are used) */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  /* sanity checks on the device structures produced by the copy/transpose steps above */
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix: build a CSR view of B that
              shares columns/values with the compressed storage but has full row offsets */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* owned by the product data; also reused by the numeric phase */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count (logged here and by the numeric phase via mmdata->flops) */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO: no flops estimate available for A*B^t yet */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* C descriptor is created empty (nnz 0, NULL arrays); pointers are set once the
     pattern size is known, via cusparseCsrSetPointers below */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* symbolic nnz computation: each call is made twice, first to query a buffer
     size with NULL, then to do the work with the allocated buffer */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  /* dBuffer1/2 are no longer needed; dBuffer3 must survive until after _copy below */
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy the pattern into C; dBuffer5 is kept in mmdata since _compute needs it */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* compute the numerical values too; the numeric phase will call _compute again with
     the retained descriptor/buffers to recompute values for updated A and B */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
/* finalization (also the target of the empty-matrix early exit above): mirror the
   device pattern of C on the host and fill the Mat_SeqAIJ bookkeeping */
finalizesym:
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets: rows absent from the
                          compressed index keep the previous offset (zero length) */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics from the host row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  /* C is now preallocated but marked unassembled; the numeric phase fills the values */
  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2797 
2798 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2799 
2800 /* handles sparse or dense B */
2801 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2802 {
2803   Mat_Product    *product = mat->product;
2804   PetscErrorCode ierr;
2805   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2806 
2807   PetscFunctionBegin;
2808   MatCheckProduct(mat,1);
2809   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2810   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2811     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2812   }
2813   if (product->type == MATPRODUCT_ABC) {
2814     Ciscusp = PETSC_FALSE;
2815     if (!product->C->boundtocpu) {
2816       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2817     }
2818   }
2819   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2820     PetscBool usecpu = PETSC_FALSE;
2821     switch (product->type) {
2822     case MATPRODUCT_AB:
2823       if (product->api_user) {
2824         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
2825         ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2826         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2827       } else {
2828         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
2829         ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2830         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2831       }
2832       break;
2833     case MATPRODUCT_AtB:
2834       if (product->api_user) {
2835         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
2836         ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2837         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2838       } else {
2839         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
2840         ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2841         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2842       }
2843       break;
2844     case MATPRODUCT_PtAP:
2845       if (product->api_user) {
2846         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
2847         ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2848         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2849       } else {
2850         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
2851         ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2852         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2853       }
2854       break;
2855     case MATPRODUCT_RARt:
2856       if (product->api_user) {
2857         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
2858         ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2859         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2860       } else {
2861         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
2862         ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2863         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2864       }
2865       break;
2866     case MATPRODUCT_ABC:
2867       if (product->api_user) {
2868         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
2869         ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2870         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2871       } else {
2872         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
2873         ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2874         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2875       }
2876       break;
2877     default:
2878       break;
2879     }
2880     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2881   }
2882   /* dispatch */
2883   if (isdense) {
2884     switch (product->type) {
2885     case MATPRODUCT_AB:
2886     case MATPRODUCT_AtB:
2887     case MATPRODUCT_ABt:
2888     case MATPRODUCT_PtAP:
2889     case MATPRODUCT_RARt:
2890      if (product->A->boundtocpu) {
2891         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2892       } else {
2893         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2894       }
2895       break;
2896     case MATPRODUCT_ABC:
2897       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2898       break;
2899     default:
2900       break;
2901     }
2902   } else if (Biscusp && Ciscusp) {
2903     switch (product->type) {
2904     case MATPRODUCT_AB:
2905     case MATPRODUCT_AtB:
2906     case MATPRODUCT_ABt:
2907       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2908       break;
2909     case MATPRODUCT_PtAP:
2910     case MATPRODUCT_RARt:
2911     case MATPRODUCT_ABC:
2912       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2913       break;
2914     default:
2915       break;
2916     }
2917   } else { /* fallback for AIJ */
2918     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2919   }
2920   PetscFunctionReturn(0);
2921 }
2922 
/* yy = A*xx: plain matrix-vector product (no transpose, no Hermitian conjugation) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2931 
/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2940 
/* yy = A^H * xx (conjugate transpose product) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2949 
/* zz = A^H * xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2958 
/* yy = A^T * xx (transpose product, no conjugation) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2967 
/* y[idx[i]] += x[i] for i = 0..n-1: scatter-add a compressed work vector back into the full result.
   Launched below with a 1-D grid of ceil(n/256) blocks of 256 threads; extra tail threads are
   bounds-checked out. idx is read-only (const enables the read-only data cache). The index is
   computed in PetscInt rather than int so it cannot overflow when PETSc is built with 64-bit
   indices and n exceeds 2^31. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2973 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/* Shared kernel behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose(Add).
   yy may be NULL (plain product) or equal to zz (in-place add). Handles both the generic-API
   (CUDA >= 11) and legacy cuSPARSE SpMV paths, and the compressed-row case where zero rows of A
   have been dropped and a work vector plus a scatter is needed to produce the full-length result. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows, so op(A) x == 0: the result is just y (or zero when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Transpose product: either let cuSPARSE apply op on the stored matrix, or use an
       explicitly formed transpose (built lazily) when the user requested one */
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used as an index into cuSpMV[]; guard against a future ABI change of the enum values */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops (multiply+add) per stored nonzero; without the y-add, each nonempty row saves one add */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3165 
/* zz = A^T * xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3174 
/* Finish assembly on the host, then drop the cached device matrix if the nonzero
   structure changed during assembly (it would no longer match the host data). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state_before = A->nonzerostate; /* snapshot to detect structural changes */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusp->deviceMat && state_before != A->nonzerostate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3192 
3193 /* --------------------------------------------------------------------------------*/
3194 /*@
3195    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3198    assembly performance the user should preallocate the matrix storage by setting
3199    the parameter nz (or the array nnz).  By setting these parameters accurately,
3200    performance during matrix assembly can be increased by more than a factor of 50.
3201 
3202    Collective
3203 
3204    Input Parameters:
3205 +  comm - MPI communicator, set to PETSC_COMM_SELF
3206 .  m - number of rows
3207 .  n - number of columns
3208 .  nz - number of nonzeros per row (same for all rows)
3209 -  nnz - array containing the number of nonzeros in the various rows
3210          (possibly different for each row) or NULL
3211 
3212    Output Parameter:
3213 .  A - the matrix
3214 
3215    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3217    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3218 
3219    Notes:
3220    If nnz is given then nz is ignored
3221 
3222    The AIJ format (also called the Yale sparse matrix format or
3223    compressed row storage), is fully compatible with standard Fortran 77
3224    storage.  That is, the stored row and column indices can begin at
3225    either one (as in Fortran) or zero.  See the users' manual for details.
3226 
3227    Specify the preallocated storage with either nz or nnz (not both).
3228    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3229    allocation.  For large problems you MUST preallocate memory or you
3230    will get TERRIBLE performance, see the users' manual chapter on matrices.
3231 
3232    By default, this format uses inodes (identical nodes) when possible, to
3233    improve numerical efficiency of matrix-vector products and solves. We
3234    search for consecutive rows with the same nonzero structure, thereby
3235    reusing matrix information to achieve increased efficiency.
3236 
3237    Level: intermediate
3238 
3239 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3240 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3252 
/* Free the GPU-side data (matrix structures for an unfactored matrix, triangular-factor
   structures for a factored one), remove the methods composed on this object, and then
   hand off to MatDestroy_SeqAIJ() to free the host representation. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* Reset composed functions so no stale pointers survive the destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3275 
3276 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3277 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying the host (SeqAIJ) representation, then converting the copy
   in place back to MATSEQAIJCUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3287 
/* Y = Y + a*X on the GPU. Three paths:
   - SAME_NONZERO_PATTERN: a single cuBLAS axpy on the value arrays;
   - SUBSET_NONZERO_PATTERN: cuSPARSE csrgeam with Y's pattern as the output pattern;
   - otherwise: fall back to the host MatAXPY_SeqAIJ() (which may change Y's pattern). */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* X is not (or no longer) a CUSPARSE matrix: do the work on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* equal nonzero counts: compare the index arrays on the device to detect identical patterns */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host here, so switch the pointer mode around the calls */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3385 
/* Y = a*Y: scale the stored nonzero values in place on the GPU with a cuBLAS scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   inc = 1, n = 1;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr); /* nz must fit in a BLAS int */
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,n,&a,vals,inc);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr); /* one multiply per nonzero */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3407 
/* Zero all stored values of A. Zeros the device copies (matrix and, if formed, its
   transpose) with thrust::fill when present, then always zeros the host array, so the
   offload mask can be set to BOTH only when the device values were actually zeroed. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      devicezeroed = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        devicezeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* zero the host values as well (what MatZeroEntries_SeqAIJ() would do) */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = devicezeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3438 
/* Switch A's operation table between the host (SeqAIJ) and device (CUSPARSE)
   implementations. flg = PETSC_TRUE binds the matrix to the CPU; PETSC_FALSE
   restores the GPU paths. Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); /* factored matrices keep their ops */
  if (flg) {
    /* make the host copy of the values current before routing operations to the CPU */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL; /* fall back to PETSc's default implementations */
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* remove the GPU-specific composed methods; MatSeqAIJGetArray_C reverts to the host version */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* install the CUSPARSE implementations and the GPU-specific composed methods */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inode optimization is only usable by the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3493 
/* Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE. Depending on reuse the result is a new
   matrix (MAT_INITIAL_MATRIX), a copy into an existing one (MAT_REUSE_MATRIX), or an
   in-place type change (otherwise, i.e. MAT_INPLACE_MATRIX). Allocates the GPU-side
   context (cusparse handle bound to PETSc's default stream, storage format, SpMV/SpMM
   and csr2csc algorithm selections) and installs the CUSPARSE function table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* for in-place conversion B aliases A */

  /* vectors created by this matrix default to the CUDA type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: full Mat_SeqAIJCUSPARSE context */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: triangular-factors context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* no device data yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation table (bind-to-CPU = PETSC_FALSE) */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
3554 
/* Constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then convert it
   in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3564 
3565 /*MC
3566    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3567 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3569    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3570    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3571 
3572    Options Database Keys:
3573 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3574 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3575 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3576 
3577   Level: beginner
3578 
3579 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3580 M*/
3581 
3582 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3583 
/* Register the CUSPARSE-based direct solvers with PETSc's MatSolverType registry:
   the band LU solver for SeqAIJ, and LU/Cholesky/ILU/ICC for SeqAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
3597 
/* Free an entire Mat_SeqAIJCUSPARSE context: the device matrix and its cached
   transpose, the thrust scratch arrays (work vector, row offsets, COO permutation
   data, csr2csc indices), the cusparse handle, and the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* thrust containers; delete on NULL is a no-op */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); /* also NULLs the caller's pointer */
  }
  PetscFunctionReturn(0);
}
3617 
/* Release the device storage of a CsrMatrix (values, column indices, row offsets)
   and the CsrMatrix object itself; the caller's pointer is reset to NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0); /* nothing to free */
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = NULL;
  PetscFunctionReturn(0);
}
3630 
/* Free one triangular-factor structure: the cusparse matrix descriptor, the
   analysis info, the factor's CSR storage, and the solve / host staging /
   csr2csc scratch buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr); /* also NULLs the caller's pointer */
  }
  PetscFunctionReturn(0);
}
3650 
/* Free a matrix-multiply structure: the stored matrix (CSR, or HYB/ELL on pre-11
   CUDA), the cusparse descriptors, the compressed-row indices, the device-resident
   scalar constants, and (CUDA >= 11) the cached SpMV descriptors and buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code; it was previously silently discarded */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used by the SpMV/SpMM calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3694 
/* Reset a triangular-factors context to its empty state: destroy the four factor
   structures (lower/upper, plain and transposed), the permutation index arrays,
   the work vector, and the band-factorization device buffers. The struct itself
   and its cusparse handle are kept, so it can be refilled by a new factorization. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    /* bug fix: NULL the freed device buffers (like the pointers above) so a second
       Reset() does not call cudaFree on dangling pointers */
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3717 
/* Destroy a triangular-factors context completely: reset all factor data, destroy
   the cusparse handle, and free the struct (the caller's pointer is NULLed by
   PetscFree). */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle; /* was an assignment inside the if-condition */
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3734 
/* Strict weak ordering on (row,col) tuples: order by row first, then by column.
   Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 != r2) ? (r1 < r2) : (t1.get<1>() < t2.get<1>());
  }
};
3745 
/* Equality of (row,col) tuples; used to detect duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3755 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3764 
/* Logical OR of two flags, yielding 0 or 1 as a PetscInt. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3773 
3774 #include <thrust/iterator/discard_iterator.h>
/* Set (INSERT_VALUES) or accumulate (ADD_VALUES) all matrix values at once from a
   COO-ordered array v[] matching the (i,j) layout given to
   MatSetPreallocationCOO_SeqAIJCUSPARSE(). cusp->cooPerm maps sorted CSR positions
   back to positions in v[]; cusp->cooPerm_a exists only when the COO input had
   duplicate (i,j) entries and maps each input to its unique nonzero slot. v may be
   a host or a device pointer; host data is staged through a temporary device array.
   v == NULL with INSERT_VALUES zeroes the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done; fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage the host values in a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      /* then add the reduced values into the existing matrix values */
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device only */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3856 
/* Mark the cached transpose of A as out of date; with destroy = PETSC_TRUE the
   cached transpose structure and the csr2csc index array are freed as well. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct) PetscFunctionReturn(0);
  if (destroy) {
    /* throw away the cached transpose and the permutation used to rebuild it */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
    delete cusparsestruct->csr2csc_i;
    cusparsestruct->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE; /* rebuilt lazily at next use */
  PetscFunctionReturn(0);
}
3873 
3874 #include <thrust/binary_search.h>
/* Preallocate A from n COO entries (coo_i[],coo_j[]): sort the (i,j) pairs on the
   device, collapse duplicates, build the CSR structure (a->i, a->j) on the host,
   and record in cusp->cooPerm (and cusp->cooPerm_a when duplicates exist) the
   permutation later used by MatSetValuesCOO_SeqAIJCUSPARSE() to scatter values. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* the number of COO entries changed; discard any previous permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    /* upload the COO indices to the device */
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* replace the host CSR arrays with freshly allocated ones matching the unique COO pattern */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* fill per-row lengths and track the longest row and the nonempty-row count */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty matrix */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3992 
3993 /*@C
3994     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
3995 
3996    Not collective
3997 
3998     Input Parameters:
3999 +   A - the matrix
4000 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4001 
4002     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4005 
4006     Level: developer
4007 
4008     Notes:
4009       When compressed is true, the CSR structure does not contain empty rows
4010 
4011 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
4012 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* NOTE(review): if either output pointer is NULL the routine returns without filling
     the other one, despite the later individual if(i)/if(j) checks — confirm intended */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out device pointers */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build (and cache) the uncompressed row offsets on the device from the host a->i */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4041 
4042 /*@C
4043     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4044 
4045    Not collective
4046 
4047     Input Parameters:
4048 +   A - the matrix
4049 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4050 
4051     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
4054 
4055     Level: developer
4056 
4057 .seealso: MatSeqAIJCUSPARSEGetIJ()
4058 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the device arrays remain owned by the matrix; only the caller's pointers are invalidated */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
4068 
4069 /*@C
4070    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4071 
4072    Not Collective
4073 
4074    Input Parameter:
4075 .   A - a MATSEQAIJCUSPARSE matrix
4076 
4077    Output Parameter:
4078 .   a - pointer to the device data
4079 
4080    Level: developer
4081 
4082    Notes: may trigger host-device copies if up-to-date matrix data is on host
4083 
4084 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4085 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* push any newer host values to the device before exposing the device array */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get(); /* raw device pointer to the CSR values */
  PetscFunctionReturn(0);
}
4104 
4105 /*@C
4106    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4107 
4108    Not Collective
4109 
4110    Input Parameter:
4111 .   A - a MATSEQAIJCUSPARSE matrix
4112 
4113    Output Parameter:
4114 .   a - pointer to the device data
4115 
4116    Level: developer
4117 
4118 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4119 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: no state change, just invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4129 
4130 /*@C
4131    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4132 
4133    Not Collective
4134 
4135    Input Parameter:
4136 .   A - a MATSEQAIJCUSPARSE matrix
4137 
4138    Output Parameter:
4139 .   a - pointer to the device data
4140 
4141    Level: developer
4142 
4143    Notes: may trigger host-device copies if up-to-date matrix data is on host
4144 
4145 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4146 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* may trigger a host-to-device copy if the freshest data is on the host */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may write through the pointer: the device copy becomes the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  /* any cached transpose values become stale once the caller can modify entries */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4167 /*@C
4168    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4169 
4170    Not Collective
4171 
4172    Input Parameter:
4173 .   A - a MATSEQAIJCUSPARSE matrix
4174 
4175    Output Parameter:
4176 .   a - pointer to the device data
4177 
4178    Level: developer
4179 
4180 .seealso: MatSeqAIJCUSPARSEGetArray()
4181 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the caller may have changed the matrix values: record the modification */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's pointer so it cannot be reused */
  PetscFunctionReturn(0);
}
4194 
4195 /*@C
4196    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4197 
4198    Not Collective
4199 
4200    Input Parameter:
4201 .   A - a MATSEQAIJCUSPARSE matrix
4202 
4203    Output Parameter:
4204 .   a - pointer to the device data
4205 
4206    Level: developer
4207 
4208    Notes: does not trigger host-device copies and flags data validity on the GPU
4209 
4210 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
4211 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no MatSeqAIJCUSPARSECopyToGPU()
     is done here since the current values will be overwritten by the caller */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the device copy becomes the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  /* any cached transpose values become stale once the caller can modify entries */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4231 
4232 /*@C
4233    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
4234 
4235    Not Collective
4236 
   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data obtained with MatSeqAIJCUSPARSEGetArrayWrite(); set to NULL upon return
4242 
4243    Level: developer
4244 
4245 .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
4246 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the caller wrote new matrix values: record the modification */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's pointer so it cannot be reused */
  PetscFunctionReturn(0);
}
4259 
/* Lexicographic (row,col) "less than" over (row, col, value, perm) tuples,
   used to merge two COO listings into sorted order */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;               /* primary key: row index */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* tie-break: column index */
  }
};
4270 
/* Functor that adds a fixed offset to an integer (used to shift column
   indices of B by the number of columns of A when concatenating) */
struct Shift
{
  int _shift; /* the offset added to every input value */

  Shift(int offset) : _shift(offset) {}
  __host__ __device__
  inline int operator() (const int &idx)
  {
    return _shift + idx;
  }
};
4282 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation

   With MAT_INITIAL_MATRIX a new matrix C = [A B] is created entirely on the GPU (plus host copies of i/j/a
   for the SeqAIJ part); the permutation used to interleave the entries of A and B is cached in Ccusp->cooPerm
   so that a subsequent MAT_REUSE_MATRIX call only needs to scatter the new values.
   Note: fixes the previous "Invalid number or rows" typo in the error messages. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two sorted COO listings by (row,col); the 1/0 markers in
         wPerm record which entries came from A and which from B */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back to CSR row offsets for C */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C^T = [A^T; B^T] is obtained by stacking A^T on top of B^T: row
           offsets of B^T are shifted by nnz(A) and appended after A^T's */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overwrite A^T's trailing offset with B^T's first shifted one */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* set up the host (SeqAIJ) representation of C */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter the values of A (first Annz entries of cooPerm) and B (the rest)
         into C using the permutation computed during MAT_INITIAL_MATRIX */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4586 
/* Copies n entries of the (device) value array of A, selected by the indices idx[],
   into v; v may reside in host or device memory (detected via isCudaMem()).
   With idx == NULL the first n values are copied verbatim.
   Bug fix: the final log call was PetscLogCpuToGpu(), but the transfer in the
   !dmem case is device-to-host, so it must be PetscLogGpuToCpu(). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* gather into a device scratch buffer, then copy it back to the host */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather av[idx[i]] -> dv[i] on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); } /* was PetscLogCpuToGpu(): direction is GPU->CPU */
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4626