xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a06ef8240a5969b9b142e0177c46f61fb8e1a643)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
/* Human-readable names for MatCUSPARSEStorageFormat, in enum-value order, for PetscOptionsEnum();
   the trailing entries are the enum type name, the option prefix, and the required null terminator */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* NOTE: the position of each name below must equal the cuSPARSE enum value it stands for;
     MatSetFromOptions_SeqAIJCUSPARSE() guards this invariant with explicit checks */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
86 
87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
89 
90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/*
   MatCUSPARSESetStream - Records the given CUDA stream in the matrix's cuSPARSE
   context and attaches it to the cuSPARSE handle, so subsequent cuSPARSE calls
   on this matrix are enqueued on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  status = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
103 
/*
   MatCUSPARSESetHandle - Adopts an externally created cuSPARSE handle for this
   matrix, destroying any handle the matrix previously owned, and switches the
   handle to device pointer mode (scalar arguments passed as device pointers).
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle before adopting the new one */
    if (cusp->handle) {
      status = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(status);
    }
    cusp->handle = handle;
  }
  status = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
120 
/*
   MatCUSPARSEClearHandle - Forgets (without destroying) the cuSPARSE handle
   stored in the matrix's cuSPARSE context. A no-op when the matrix is not of
   type MATSEQAIJCUSPARSE or has no cuSPARSE context attached.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isCusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isCusparse);CHKERRQ(ierr);
  if (!isCusparse || !cusp) PetscFunctionReturn(0);
  /* only drop the reference; ownership of the handle lies elsewhere */
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Reports the solver package ("cusparse") that produced this factored matrix;
   composed on factor matrices as the "MatFactorGetSolverType_C" method. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates an (empty) factor matrix for the
   requested factorization type, wiring up the symbolic-factorization kernels and
   the preferred orderings for each supported factor kind.

   Supported: LU/ILU/ILUDT and Cholesky/ICC; any other factor type is an error.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* nested dissection for full LU; natural ordering for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
184 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind
   MatCUSPARSESetFormat(); for sequential matrices MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both set the single stored format.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    /* a SeqAIJCUSPARSE matrix keeps one format for every operation */
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
202 
203 /*@
204    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205    operation. Only the MatMult operation can use different GPU storage formats
206    for MPIAIJCUSPARSE matrices.
207    Not Collective
208 
209    Input Parameters:
210 +  A - Matrix of type SEQAIJCUSPARSE
211 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
212 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
213 
214    Output Parameter:
215 
216    Level: intermediate
217 
218 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if composed; silently a no-op otherwise (PetscTryMethod) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229 
/*
   MatSetOption_SeqAIJCUSPARSE - Handles MAT_FORM_EXPLICIT_TRANSPOSE locally and
   forwards every other option to the host SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
247 
248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
249 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization: runs the host
   SeqAIJ factorization, selects the solve kernels (a faster path exists when
   both row and column permutations are the identity), then pushes the
   triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the factorization runs on the host, so make sure the host copy of A is current */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no multi-RHS solve is provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes runtime options for SeqAIJCUSPARSE
   matrices: storage format for SpMV and/or TriSolve, and (CUDA >= 11) the
   cuSPARSE algorithm choices for SpMV, SpMM and CSR-to-CSC conversion.
   Options apply only to unfactored matrices (MAT_FACTOR_NONE).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for all operations (overrides the one above if both are given) */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
319 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU: discards any stale GPU
   triangular-factor data, runs the host SeqAIJ symbolic phase, and installs
   the CUSPARSE numeric-factorization kernel.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
331 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU: discards any stale GPU
   triangular-factor data, runs the host SeqAIJ symbolic phase, and installs
   the CUSPARSE numeric-factorization kernel.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
343 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC: discards any stale GPU
   triangular-factor data, runs the host SeqAIJ symbolic phase, and installs
   the CUSPARSE Cholesky numeric-factorization kernel.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
355 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky: discards any
   stale GPU triangular-factor data, runs the host SeqAIJ symbolic phase, and
   installs the CUSPARSE Cholesky numeric-factorization kernel.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
367 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (or refreshes) the GPU copy of
   the unit-lower-triangular factor L from the host CSR storage of an already
   factored SeqAIJ matrix, and runs the cuSPARSE triangular-solve analysis.

   On first call (no loTriFactorPtr yet) the full CSR structure of L is assembled
   in pinned host memory -- the strictly-lower part of each row of the factored
   matrix plus an explicit unit diagonal entry -- uploaded to the device, and the
   solve analysis information is created. On later calls only the numerical
   values are recomputed and re-uploaded; the sparsity pattern is reused.

   NOTE(review): the catch clause only handles `char*` exceptions; presumably
   thrust/cusparse wrappers here throw that type -- std::exception-derived
   throws would escape. Confirm against the wrappers in cusparsematimpl.h.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust assign()s below copy at full bandwidth */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;      /* row 0 of L is just the unit diagonal */
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];          /* strictly-lower entries stored for row i */
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the off-diagonal part of the row, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* upload the assembled CSR arrays to the device */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit scratch buffer for the solve analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer (AA_h) for fast refreshes; the index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;    /* re-insert the unit diagonal */
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
514 
515 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
516 {
517   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
518   PetscInt                          n = A->rmap->n;
519   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
520   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
521   cusparseStatus_t                  stat;
522   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
523   const MatScalar                   *aa = a->a,*v;
524   PetscInt                          *AiUp, *AjUp;
525   PetscInt                          i,nz, nzUpper, offset;
526   PetscErrorCode                    ierr;
527   cudaError_t                       cerr;
528 
529   PetscFunctionBegin;
530   if (!n) PetscFunctionReturn(0);
531   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
532     try {
533       /* next, figure out the number of nonzeros in the upper triangular matrix. */
534       nzUpper = adiag[0]-adiag[n];
535       if (!upTriFactor) {
536         PetscScalar *AAUp;
537 
538         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
539 
540         /* Allocate Space for the upper triangular matrix */
541         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
542         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
543 
544         /* Fill the upper triangular matrix */
545         AiUp[0]=(PetscInt) 0;
546         AiUp[n]=nzUpper;
547         offset = nzUpper;
548         for (i=n-1; i>=0; i--) {
549           v  = aa + adiag[i+1] + 1;
550           vi = aj + adiag[i+1] + 1;
551 
552           /* number of elements NOT on the diagonal */
553           nz = adiag[i] - adiag[i+1]-1;
554 
555           /* decrement the offset */
556           offset -= (nz+1);
557 
558           /* first, set the diagonal elements */
559           AjUp[offset] = (PetscInt) i;
560           AAUp[offset] = (MatScalar)1./v[nz];
561           AiUp[i]      = AiUp[i+1] - (nz+1);
562 
563           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
564           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
565         }
566 
567         /* allocate space for the triangular factor information */
568         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
569         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
570 
571         /* Create the matrix description */
572         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
573         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
574        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
575         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
576        #else
577         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
578        #endif
579         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
580         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
581 
582         /* set the operation */
583         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
584 
585         /* set the matrix */
586         upTriFactor->csrMat = new CsrMatrix;
587         upTriFactor->csrMat->num_rows = n;
588         upTriFactor->csrMat->num_cols = n;
589         upTriFactor->csrMat->num_entries = nzUpper;
590 
591         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
592         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
593 
594         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
595         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
596 
597         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
598         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
599 
600         /* Create the solve analysis information */
601         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
602         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
603       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
604         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
605                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
606                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
607                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
608                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
609         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
610       #endif
611 
612         /* perform the solve analysis */
613         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
614                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
615                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
616                                  upTriFactor->csrMat->column_indices->data().get(),
617                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
618                                  upTriFactor->solveInfo,
619                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
620                                #else
621                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
622                                #endif
623         cerr = WaitForCUDA();CHKERRCUDA(cerr);
624         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
625 
626         /* assign the pointer */
627         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
628         upTriFactor->AA_h = AAUp;
629         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
630         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
631         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
632       } else {
633         if (!upTriFactor->AA_h) {
634           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
635         }
636         /* Fill the upper triangular matrix */
637         offset = nzUpper;
638         for (i=n-1; i>=0; i--) {
639           v  = aa + adiag[i+1] + 1;
640 
641           /* number of elements NOT on the diagonal */
642           nz = adiag[i] - adiag[i+1]-1;
643 
644           /* decrement the offset */
645           offset -= (nz+1);
646 
647           /* first, set the diagonal elements */
648           upTriFactor->AA_h[offset] = 1./v[nz];
649           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
650         }
651         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
652         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
653       }
654     } catch(char *ex) {
655       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
656     }
657   }
658   PetscFunctionReturn(0);
659 }
660 
/*
   MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - mirrors the ILU triangular factors of A
   onto the GPU and caches the row/column permutation index arrays (when the
   orderings are not the identity) for use by the device triangular solves.

   Collective on A; expects A->spptr to already hold a Mat_SeqAIJCUSPARSETriFactors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS,colIS;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  rowIS = aij->row;
  colIS = aij->icol;

  /* push both triangular factors of the ILU to the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation, applied before the lower-triangular solve; only
     uploaded once and only when the ordering is not the identity */
  ierr = ISIdentity(rowIS,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *indices;

    ierr = ISGetIndices(rowIS,&indices);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(indices,indices+nrows);
    ierr = ISRestoreIndices(rowIS,&indices);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation, applied after the upper-triangular solve */
  ierr = ISIdentity(colIS,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *indices;

    ierr = ISGetIndices(colIS,&indices);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(indices,indices+nrows);
    ierr = ISRestoreIndices(colIS,&indices);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
704 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds (first call) or refreshes (later calls)
   the GPU copies of the triangular factors produced by the host ICC factorization.

   The factor is packed on the host into a single upper-triangular CSR (AiUp/AjUp/AAUp)
   whose diagonal entry is stored FIRST in each row; the "lower" factor is represented
   by the same CSR pattern with rescaled values (AALo) and is solved with
   CUSPARSE_OPERATION_TRANSPOSE, so only one sparsity pattern lives on the GPU.

   On the first call (no factor structs yet) the full structure (row offsets, column
   indices, values) is created, the cuSPARSE descriptors are set up and the solve
   analysis is performed; on subsequent calls only the numerical values are rebuilt
   and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ and Mat_SeqSBAIJ here; this
     relies on the i/j/a fields being layout-compatible between the two structs --
     confirm against the struct definitions if either type changes */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date copy lives on the CPU (or nothing is allocated yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays (async-friendly uploads) */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first time through: build pattern + values from scratch */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers; within each source row the diagonal entry is the
             LAST of the nz+1 stored entries (v[nz]), the first nz are off-diagonals */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: packed row starts with 1/D(i,i) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negated for the upper solve; additionally scaled by
               the inverse diagonal for the (transposed) lower solve */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        /* unit diagonal: the stored 1/D values are consumed by the lower/transpose
           solve, while the upper solve treats the diagonal as implicit ones */
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 path: query and allocate the analysis/solve workspace */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the "lower" factor reuses the UPPER pattern and is solved transposed */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same pattern as the upper factor, different values (AALo) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        /* two factors were uploaded: offsets + indices (int) and values (scalar) each */
        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* refactorization: the sparsity pattern on the GPU is still valid,
           only rebuild the numerical values and re-upload them */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
924 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - uploads the ICC triangular factors of A
   to the GPU and, for a non-identity ordering, caches both the permutation and its
   inverse so the device solves can apply/undo the reordering.

   Expects A->spptr to already hold a Mat_SeqAIJCUSPARSETriFactors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm;
  PetscBool                    isIdentity;
  PetscInt                     nrows = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  perm = aij->row;
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* only the upper factor is stored; count its off-diagonals twice plus the diagonal */
  factors->nnz = (aij->nz-nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* for symmetric (Cholesky) factors a single ordering serves both solves:
     rperm gets the permutation, cperm its inverse */
  ierr = ISIdentity(perm,&isIdentity);CHKERRQ(ierr);
  if (!isIdentity) {
    IS             invPerm;
    const PetscInt *inv,*fwd;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invPerm);CHKERRQ(ierr);
    ierr = ISGetIndices(invPerm,&inv);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&fwd);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(fwd,fwd+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(inv,inv+nrows);
    ierr = ISRestoreIndices(invPerm,&inv);CHKERRQ(ierr);
    ierr = ISDestroy(&invPerm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&fwd);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
962 
/*
   MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for
   SeqAIJCUSPARSE matrices.

   The factorization itself runs on the host (via MatCholeskyFactorNumeric_SeqAIJ);
   this wrapper ensures A's values are current on the CPU first, selects the solve
   kernels based on whether the ordering is the natural one, and then pushes the
   triangular factors back to the GPU.

   B - the factor matrix; A - the matrix being factored; info - factorization options
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;

  PetscFunctionBegin;
  /* the host factorization needs up-to-date values for A */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the natural-ordering
     variants skip the permutation steps */
  ierr = ISIdentity(bseq->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no dedicated MatMatSolve implementations in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
992 
993 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
994 {
995   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
996   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
997   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
998   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
999   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1000   cusparseStatus_t                  stat;
1001   cusparseIndexBase_t               indexBase;
1002   cusparseMatrixType_t              matrixType;
1003   cusparseFillMode_t                fillMode;
1004   cusparseDiagType_t                diagType;
1005   cudaError_t                       cerr;
1006   PetscErrorCode                    ierr;
1007 
1008   PetscFunctionBegin;
1009   /* allocate space for the transpose of the lower triangular factor */
1010   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1011   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1012 
1013   /* set the matrix descriptors of the lower triangular factor */
1014   matrixType = cusparseGetMatType(loTriFactor->descr);
1015   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1016   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1017     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1018   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1019 
1020   /* Create the matrix description */
1021   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
1022   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1023   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1024   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1025   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1026 
1027   /* set the operation */
1028   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1029 
1030   /* allocate GPU space for the CSC of the lower triangular factor*/
1031   loTriFactorT->csrMat = new CsrMatrix;
1032   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1033   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1034   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1035   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1036   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1037   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1038 
1039   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1040 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1041   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1042                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1043                                        loTriFactor->csrMat->values->data().get(),
1044                                        loTriFactor->csrMat->row_offsets->data().get(),
1045                                        loTriFactor->csrMat->column_indices->data().get(),
1046                                        loTriFactorT->csrMat->values->data().get(),
1047                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1048                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1049                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1050   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1051 #endif
1052 
1053   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1054   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1055                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1056                           loTriFactor->csrMat->values->data().get(),
1057                           loTriFactor->csrMat->row_offsets->data().get(),
1058                           loTriFactor->csrMat->column_indices->data().get(),
1059                           loTriFactorT->csrMat->values->data().get(),
1060                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1061                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1062                           CUSPARSE_ACTION_NUMERIC, indexBase,
1063                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1064                         #else
1065                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1066                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1067                         #endif
1068   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1069   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1070 
1071   /* Create the solve analysis information */
1072   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1073   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1074 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1075   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1077                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1078                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1079                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1080   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1081 #endif
1082 
1083   /* perform the solve analysis */
1084   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1085                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1086                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1087                            loTriFactorT->csrMat->column_indices->data().get(),
1088                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1089                            loTriFactorT->solveInfo,
1090                            loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1091                           #else
1092                            loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1093                           #endif
1094   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1095   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1096 
1097   /* assign the pointer */
1098   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1099 
1100   /*********************************************/
1101   /* Now the Transpose of the Upper Tri Factor */
1102   /*********************************************/
1103 
1104   /* allocate space for the transpose of the upper triangular factor */
1105   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1106   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1107 
1108   /* set the matrix descriptors of the upper triangular factor */
1109   matrixType = cusparseGetMatType(upTriFactor->descr);
1110   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1111   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1112     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1113   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1114 
1115   /* Create the matrix description */
1116   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
1117   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
1118   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
1119   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
1120   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1121 
1122   /* set the operation */
1123   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1124 
1125   /* allocate GPU space for the CSC of the upper triangular factor*/
1126   upTriFactorT->csrMat = new CsrMatrix;
1127   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1128   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1129   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1130   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1131   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1132   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1133 
1134   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1135 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1136   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1137                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1138                                 upTriFactor->csrMat->values->data().get(),
1139                                 upTriFactor->csrMat->row_offsets->data().get(),
1140                                 upTriFactor->csrMat->column_indices->data().get(),
1141                                 upTriFactorT->csrMat->values->data().get(),
1142                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1143                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1144                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1145   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1146 #endif
1147 
1148   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1149   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1150                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1151                           upTriFactor->csrMat->values->data().get(),
1152                           upTriFactor->csrMat->row_offsets->data().get(),
1153                           upTriFactor->csrMat->column_indices->data().get(),
1154                           upTriFactorT->csrMat->values->data().get(),
1155                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1156                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1157                           CUSPARSE_ACTION_NUMERIC, indexBase,
1158                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
1159                         #else
1160                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1161                           CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1162                         #endif
1163 
1164   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1165   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1166 
1167   /* Create the solve analysis information */
1168   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1169   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1170   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1171   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1172                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1173                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1174                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1175                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1176   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1177   #endif
1178 
1179   /* perform the solve analysis */
1180   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1181                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1182                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1183                            upTriFactorT->csrMat->column_indices->data().get(),
1184                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1185                            upTriFactorT->solveInfo,
1186                            upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1187                           #else
1188                            upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1189                           #endif
1190 
1191   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1192   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1193 
1194   /* assign the pointer */
1195   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1196   PetscFunctionReturn(0);
1197 }
1198 
/* Unary functor for thrust::transform: truncates the real part of a
   PetscScalar to a PetscInt. It is used below to recover the integer
   permutation csr2csc_i after csr2csc has shuffled a scalar-encoded
   0,1,2,... sequence. */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar value)
  {
    return (PetscInt)PetscRealPart(value);
  }
};
1207 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - builds (or refreshes) the explicit
  transpose of a SeqAIJCUSPARSE matrix on the GPU and stores it in
  cusparsestruct->matTranspose.

  For CSR storage the first call converts A to A^T with cusparse csr2csc and
  caches in cusparsestruct->csr2csc_i the permutation mapping A's value array
  onto A^T's value array; subsequent calls then refresh A^T's values with a
  single thrust gather instead of another csr2csc conversion.  For ELL/HYB
  storage (pre CUDA-11 only) the transpose is rebuilt via hyb2csr -> csr2csc
  -> csr2hyb.  Returns immediately when A->transupdated says the transpose is
  already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* make sure the GPU mirror of A is current before transposing it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* transpose already up to date */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* ELL/HYB transposes are rebuilt from scratch below, so drop any stale one */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalars used by cusparse SpMV-type calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n; /* dimensions are swapped for the transpose */
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload A's host row offsets (a->i); csr2csc below reads them as input */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT holds the transpose yet uses A->rmap->n for num_rows and
         the row_offsets length (and csr2hyb below passes A->rmap->n, A->cmap->n
         unswapped); this appears to assume a square matrix -- confirm */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the A -> A^T value permutation: run csr2csc on the sequence
         0,1,2,... encoded as scalars, then truncate the permuted scalars back
         to integers (PetscScalarToPetscInt) into csr2csc_i. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather A's (possibly updated) values into A^T using the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1437 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU using the
  transposed triangular factors, honoring the row/column permutations of the
  factorization ordering.  The transposed factor analysis is generated lazily
  on first use.  Pipeline:
    1) x <- b gathered through rpermIndices
    2) solve with the transposed upper factor (reads x, writes the work vector)
    3) solve with the transposed lower factor (reads the work vector, writes x)
    4) gather x through cpermIndices into the work vector, then copy back to x
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  /* NOTE(review): the end iterator's base is bGPU+n while the begin's is bGPU;
     MatSolve_SeqAIJCUSPARSE uses the same base for both. Presumably harmless
     since only the index iterators delimit the copied range -- confirm */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: reads xarray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1526 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b on the
  GPU for a factorization in natural ordering, i.e. without the row/column
  permutation gathers of MatSolveTranspose_SeqAIJCUSPARSE: solve with the
  transposed upper factor (b -> work vector), then with the transposed lower
  factor (work vector -> x).  The transposed factor analysis is generated
  lazily on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: reads barray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1596 
/*
  MatSolve_SeqAIJCUSPARSE - solves A x = b on the GPU with the cached
  triangular factors, honoring the row/column permutations of the
  factorization ordering.  Pipeline:
    1) work vector <- b gathered through rpermIndices
    2) solve L (reads the work vector, writes x)
    3) solve U (reads x, writes the work vector)
    4) x <- work vector gathered through cpermIndices
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U: reads xarray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1672 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solves A x = b on the GPU for a
  factorization in natural ordering, i.e. without the permutation gathers of
  MatSolve_SeqAIJCUSPARSE: solve L (b -> work vector), then solve U
  (work vector -> x).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: reads barray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1734 
/* Mirror the CSR numerical values held on the GPU back into the host Mat_SeqAIJ array.
   Only values are transferred; the sparsity pattern is already identical on both sides.
   No-op unless the GPU holds the sole up-to-date copy (PETSC_OFFLOAD_GPU). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cuerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr  = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cuerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cuerr);
    cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
    ierr  = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr  = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device copies now both valid */
  }
  PetscFunctionReturn(0);
}
1755 
/* MatSeqAIJGetArray implementation for SEQAIJCUSPARSE: bring the host value array up to
   date and return it.  The caller may write through the pointer, so the device copy is
   marked stale (PETSC_OFFLOAD_CPU). */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  /* the returned pointer permits modification, so only the CPU copy is trusted from here on */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1767 
/* Push the host AIJ matrix to the GPU in the storage format selected in Mat_SeqAIJCUSPARSE.
   Fast path: if the nonzero pattern is unchanged (same nonzerostate) and the format is CSR,
   only the values are re-uploaded.  Slow path: destroy and rebuild the whole GPU mult
   structure (CSR, or ELL/HYB on pre-CUDA-11), including the compressed-row index support. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when no host values exist, so offloadmask stays unchanged */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so any cached explicit transpose is stale (pattern kept: PETSC_FALSE) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern (or format) changed: rebuild everything from scratch */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the row structure: compressed (only nonempty rows) or the full row pointer */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet: size from the row pointer and do not mark host/device as synced */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants, needed because the handle uses device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage a temporary CSR copy on the device, convert it to HYB, then discard it */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        /* NOTE(review): thrust throws std::exception-derived types, which a char* handler
           does not catch — such exceptions would propagate past here; verify intent */
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1924 
/* Thrust functor for zipped iterators: accumulate element 0 into element 1 (dst += src). */
struct VecCUDAPlusEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT tup)
  {
    thrust::get<1>(tup) = thrust::get<1>(tup) + thrust::get<0>(tup);
  }
};
1934 
/* Thrust functor for zipped iterators: copy element 0 into element 1 (dst = src). */
struct VecCUDAEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT tup)
  {
    thrust::get<1>(tup) = thrust::get<0>(tup);
  }
};
1944 
/* Thrust functor for zipped iterators: copy element 1 into element 0 (the reverse
   assignment direction of VecCUDAEquals). */
struct VecCUDAEqualsReverse
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT tup)
  {
    thrust::get<0>(tup) = thrust::get<1>(tup);
  }
};
1954 
/* Per-product scratch data attached to Mat_Product->data for cuSPARSE mat-mat products.
   Allocated zeroed with PetscNew() and released by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense;  /* caller's C was MATSEQDENSE (host); convert C back after the GPU product */
  PetscScalar           *Bt;       /* device buffer for explicit B^T (pre-CUDA-11 csrmm lacks transpose-on-B) */
  Mat                   X;         /* dense intermediate product used by PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;     /* substitute CSR for B when B is kept in compressed-row storage */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor paired with Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;  /* cached dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;  /* cached dense descriptor for C (or X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;  /* extra SpGEMMreuse work buffers (CUDA >= 11.4) */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize; /* bytes currently allocated in mmBuffer */
  void                  *mmBuffer;    /* SpMM/SpGEMM compute buffer */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1979 
/* Destructor for the MatMatCusparse scratch data hanging off a Mat_Product.
   Safe on partially-populated structs: cudaFree(NULL) is a no-op and every
   descriptor/buffer release is guarded by a NULL check. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  cudaError_t      cuerr;
  PetscErrorCode   ierr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) { stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matBDescr)   { stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matCDescr)   { stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat); }
  if (mm->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat); }
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)    { cuerr = cudaFree(mm->dBuffer4);CHKERRCUDA(cuerr); }
  if (mm->dBuffer5)    { cuerr = cudaFree(mm->dBuffer5);CHKERRCUDA(cuerr); }
  #endif
  if (mm->mmBuffer)    { cuerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cuerr); }
  if (mm->mmBuffer2)   { cuerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cuerr); }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2008 
2009 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2010 
/* Numeric phase of the sparse(A) x dense(B) products (AB, AtB, ABt, PtAP, RARt) on the GPU.
   For PtAP/RARt it first computes the dense intermediate X = A*B' via SpMM, then finishes
   with a dense-dense multiply.  On CUDA >= 11 the generic SpMM API with cached descriptors
   and a reusable work buffer is used; earlier CUDA versions fall back to csrmm, explicitly
   transposing B when required since csrmm cannot transpose B itself. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda; /* result dims and leading dimensions of B and C */
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* choose op(A), which mult structure to use, and the dims of op(A)*op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* transpose operation on A is traded for a stored explicit transpose */
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the SpMM result into the intermediate X, not into C directly */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when the required size exceeds what we already hold */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* materialize B^T into mmdata->Bt (allocated in the symbolic phase) via cublas geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense multiply of B (or B^T) against X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the temporary conversions performed above, restoring the caller's types */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2183 
/* Symbolic phase for the sparse(A) x dense(B) products: validate the operands, size and
   type C, and attach the MatMatCusparse scratch data (including the dense intermediate X
   for PtAP/RARt and, pre-CUDA-11, the explicit B^T buffer).  The numeric phase is
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *prod = C->product;
  Mat_SeqAIJCUSPARSE *acusp;
  MatMatCusparse     *mm;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          Cwasdense,isaijcusp;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = prod->A;
  B    = prod->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isaijcusp);CHKERRQ(ierr);
  if (!isaijcusp) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the product result */
  if      (prod->type == MATPRODUCT_AB)   { m = A->rmap->n; n = B->cmap->n; }
  else if (prod->type == MATPRODUCT_AtB)  { m = A->cmap->n; n = B->cmap->n; }
  else if (prod->type == MATPRODUCT_ABt)  { m = A->rmap->n; n = B->rmap->n; }
  else if (prod->type == MATPRODUCT_PtAP) { m = B->cmap->n; n = B->cmap->n; }
  else if (prod->type == MATPRODUCT_RARt) { m = B->rmap->n; n = B->rmap->n; }
  else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[prod->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&Cwasdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = Cwasdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (prod->type == MATPRODUCT_ABt || prod->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (prod->type == MATPRODUCT_RARt || prod->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (prod->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2257 
2258 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2259 {
2260   Mat_Product                  *product = C->product;
2261   Mat                          A,B;
2262   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2263   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2264   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2265   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2266   PetscBool                    flg;
2267   PetscErrorCode               ierr;
2268   cusparseStatus_t             stat;
2269   cudaError_t                  cerr;
2270   MatProductType               ptype;
2271   MatMatCusparse               *mmdata;
2272 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2273   cusparseSpMatDescr_t         BmatSpDescr;
2274 #endif
2275   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2276 
2277   PetscFunctionBegin;
2278   MatCheckProduct(C,1);
2279   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2280   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2281   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2282   mmdata = (MatMatCusparse*)C->product->data;
2283   A = product->A;
2284   B = product->B;
2285   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2286     mmdata->reusesym = PETSC_FALSE;
2287     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2288     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2289     Cmat = Ccusp->mat;
2290     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2291     Ccsr = (CsrMatrix*)Cmat->mat;
2292     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2293     goto finalize;
2294   }
2295   if (!c->nz) goto finalize;
2296   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2297   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2298   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2299   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2300   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2301   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2302   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2303   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2304   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2305   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2306   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2307   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2308   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2309   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2310 
2311   ptype = product->type;
2312   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2313     ptype = MATPRODUCT_AB;
2314     if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2315   }
2316   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2317     ptype = MATPRODUCT_AB;
2318     if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2319   }
2320   switch (ptype) {
2321   case MATPRODUCT_AB:
2322     Amat = Acusp->mat;
2323     Bmat = Bcusp->mat;
2324     break;
2325   case MATPRODUCT_AtB:
2326     Amat = Acusp->matTranspose;
2327     Bmat = Bcusp->mat;
2328     break;
2329   case MATPRODUCT_ABt:
2330     Amat = Acusp->mat;
2331     Bmat = Bcusp->matTranspose;
2332     break;
2333   default:
2334     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2335   }
2336   Cmat = Ccusp->mat;
2337   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2338   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2339   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2340   Acsr = (CsrMatrix*)Amat->mat;
2341   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2342   Ccsr = (CsrMatrix*)Cmat->mat;
2343   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2344   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2345   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2346   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2347 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2348   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2349   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2350   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2351     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2352                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2353                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2354                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2355   #else
2356     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2357                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2358                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2359                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2360     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2361                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2362                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2363   #endif
2364 #else
2365   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2366                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2367                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2368                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2369                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2370 #endif
2371   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2372   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2373   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2374   C->offloadmask = PETSC_OFFLOAD_GPU;
2375 finalize:
2376   /* shorter version of MatAssemblyEnd_SeqAIJ */
2377   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2378   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2379   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2380   c->reallocs         = 0;
2381   C->info.mallocs    += 0;
2382   C->info.nz_unneeded = 0;
2383   C->assembled = C->was_assembled = PETSC_TRUE;
2384   C->num_ass++;
2385   PetscFunctionReturn(0);
2386 }
2387 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse-sparse
  product C = op(A)*op(B) for two MATSEQAIJCUSPARSE matrices, performed on the GPU.

  Responsibilities of this routine (all on C, collective semantics of a Seq matrix):
    - validate A and B (type, CSR storage format) and copy them to the GPU;
    - map AtB/ABt onto AB when symmetry allows, or onto explicit transposes otherwise;
    - allocate the cuSPARSE/CSR structures for C (Cmat/Ccsr) and, depending on the CUDA
      version, run the SpGEMM (>=11.0), SpGEMMreuse (>=11.4) or csrgemm (<11.0) symbolic
      machinery to discover C's nonzero pattern;
    - mirror the resulting pattern back to the host Mat_SeqAIJ (c->i, c->j, ilen/imax/rmax)
      so that CPU-side AIJ code can operate on C as well.

  Intermediate buffers/descriptors that the numeric phase needs again are stashed in the
  MatMatCusparse product data (mmdata); they are freed by MatDestroy_MatMatCusparse.

  Input Parameter:
.   C - the product matrix; C->product holds A, B and the product type

  Note: cuSPARSE spgemm does not support transposed operands, hence AtB/ABt are realized
  through explicitly formed transposes (MatSeqAIJCUSPARSEFormExplicitTranspose).
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;  /* cusparseSpMatGetSize reports sizes as int64_t */
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be MATSEQAIJCUSPARSE for the GPU spgemm path */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: created here, released by MatDestroy_MatMatCusparse when the product dies */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: At*B == A*B when A is symmetric (likewise A*Bt when B is);
     record the fact so the numeric phase (which re-derives ptype the same way) stays consistent */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick the left/right cuSPARSE mult structs and the result dimensions (m x n, inner dim k);
     compressed-row flags: C inherits compression from the left operand's row structure,
     B's compression must be undone below since spgemm needs full row offsets */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;  /* ownership passes to Ccusp->mat below */
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C reuses A's nonzero-row index set; compressedrow.i is filled later in finalizesym */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  /* the device CSR for C only stores the (possibly compressed) rows */
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants (the handle uses device pointer mode for spgemm) */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: zero row offsets, empty index/value arrays, then skip the spgemm entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CsrMatrix view of B with uncompressed row offsets; column indices and
       values are shared with B's device CSR, only the row_offsets array is new (cached in Bcusp) */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* mmdata owns this view (and frees it), the numeric phase reuses it */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each a(i,brow) multiplies the whole row brow of B: 2 flops per B entry touched */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A (as a column of At) pairs with row i of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor starts with nnz = 0 and NULL arrays; the spgemm fills in the size and we
     attach real arrays via cusparseCsrSetPointers once c->nz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* nnz phase: first call queries buffer sizes, second call (with buffers) computes the pattern */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  /* buffers 1 and 2 are no longer needed after the nnz phase; 3 is still needed by the copy phase */
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* copy phase writes C's sparsity pattern into the arrays attached above;
     dBuffer5 must persist for the numeric (compute) calls and lives in mmdata */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* run the numeric compute once here so C's values are valid right after symbolic */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
 }
 #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  /* copy the final product from the spgemm descriptor into C's arrays */
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif
#else
  /* pre-11.0 path: legacy csrgemm interface; nnz count is returned through a host pointer */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the device sparsity pattern to the host Mat_SeqAIJ so CPU kernels can use C too */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;  /* compressed row offsets land in compressedrow.i, expanded below */
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* rows not present in the compressed structure are empty: they get the same offset as
       the previous stored row; rindex[k] is the global row of compressed row k */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  /* fill in the standard SeqAIJ row statistics (row lengths, max row length, nonzero rows) */
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);  /* host value array; values themselves stay on the GPU for now */
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2786 
2787 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2788 
2789 /* handles sparse or dense B */
2790 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2791 {
2792   Mat_Product    *product = mat->product;
2793   PetscErrorCode ierr;
2794   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2795 
2796   PetscFunctionBegin;
2797   MatCheckProduct(mat,1);
2798   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2799   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2800     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2801   }
2802   if (product->type == MATPRODUCT_ABC) {
2803     Ciscusp = PETSC_FALSE;
2804     if (!product->C->boundtocpu) {
2805       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2806     }
2807   }
2808   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
2809     PetscBool usecpu = PETSC_FALSE;
2810     switch (product->type) {
2811     case MATPRODUCT_AB:
2812       if (product->api_user) {
2813         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
2814         ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2815         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2816       } else {
2817         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
2818         ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2819         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2820       }
2821       break;
2822     case MATPRODUCT_AtB:
2823       if (product->api_user) {
2824         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
2825         ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2826         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2827       } else {
2828         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
2829         ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2830         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2831       }
2832       break;
2833     case MATPRODUCT_PtAP:
2834       if (product->api_user) {
2835         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
2836         ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2837         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2838       } else {
2839         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
2840         ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2841         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2842       }
2843       break;
2844     case MATPRODUCT_RARt:
2845       if (product->api_user) {
2846         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
2847         ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2848         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2849       } else {
2850         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
2851         ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2852         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2853       }
2854       break;
2855     case MATPRODUCT_ABC:
2856       if (product->api_user) {
2857         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
2858         ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2859         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2860       } else {
2861         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
2862         ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
2863         ierr = PetscOptionsEnd();CHKERRQ(ierr);
2864       }
2865       break;
2866     default:
2867       break;
2868     }
2869     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
2870   }
2871   /* dispatch */
2872   if (isdense) {
2873     switch (product->type) {
2874     case MATPRODUCT_AB:
2875     case MATPRODUCT_AtB:
2876     case MATPRODUCT_ABt:
2877     case MATPRODUCT_PtAP:
2878     case MATPRODUCT_RARt:
2879      if (product->A->boundtocpu) {
2880         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2881       } else {
2882         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2883       }
2884       break;
2885     case MATPRODUCT_ABC:
2886       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2887       break;
2888     default:
2889       break;
2890     }
2891   } else if (Biscusp && Ciscusp) {
2892     switch (product->type) {
2893     case MATPRODUCT_AB:
2894     case MATPRODUCT_AtB:
2895     case MATPRODUCT_ABt:
2896       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2897       break;
2898     case MATPRODUCT_PtAP:
2899     case MATPRODUCT_RARt:
2900     case MATPRODUCT_ABC:
2901       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2902       break;
2903     default:
2904       break;
2905     }
2906   } else { /* fallback for AIJ */
2907     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2908   }
2909   PetscFunctionReturn(0);
2910 }
2911 
/* yy = A*xx on the GPU; thin wrapper over the common SpMV kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* NULL: nothing to add; trans = PETSC_FALSE, herm = PETSC_FALSE: plain product */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2920 
/* zz = A*xx + yy on the GPU; thin wrapper over the common SpMV kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_FALSE, herm = PETSC_FALSE: plain product, yy added into zz */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2929 
/* yy = A^H * xx on the GPU; thin wrapper over the common SpMV kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_TRUE selects the conjugate transpose */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2938 
/* zz = A^H * xx + yy on the GPU; thin wrapper over the common SpMV kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_TRUE selects the conjugate transpose; yy added into zz */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2947 
/* yy = A^T * xx on the GPU; thin wrapper over the common SpMV kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_FALSE selects the plain transpose */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2956 
/* y[idx[i]] += x[i] for 0 <= i < n: scatter-add a (compressed) work vector x into the
   full-length vector y at positions idx. One thread per entry; idx entries are presumably
   distinct (compressed row indices), so no atomics are used — TODO confirm at call sites.
   Note: the index is computed in PetscInt, not int, so the computation cannot overflow
   when PetscInt is 64-bit and n exceeds 2^31. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2962 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Common kernel behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose (and
   the *Add variants). yy may be NULL (then z = op(A) x), and yy == zz (in-place add) is allowed. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed; /* PETSC_TRUE if zero rows are dropped from the stored matrix */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x, taken from the stored matrix */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* all rows empty: op(A) x == 0, so the result reduces to y (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose on the fly to the untransposed storage */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* multiply against an explicitly stored A^T, built on first use */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector: gather the entries of x at the nonzero rows */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows; /* op is a transpose, so x/y lengths swap relative to the stored shape */
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the cuSpMV[] descriptor cache below, so guard against enum changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand: dense-vector descriptors and the SpMV work buffer */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      /* pre-CUDA-11 legacy cuSPARSE API */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) { /* transpose case with beta=1 already folded yy in when yy == zz */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3154 
/* zz = A^T * xx + yy on the GPU; thin wrapper over the common SpMV kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_FALSE selects the plain transpose; yy added into zz */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3163 
/* Finish assembly: run the host SeqAIJ assembly, then drop the cached flat device
   matrix if the nonzero state changed during assembly (it mirrors the old pattern). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   oldnnzstate = A->nonzerostate;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusp->deviceMat && oldnnzstate != A->nonzerostate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
3181 
3182 /* --------------------------------------------------------------------------------*/
3183 /*@
3184    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3185    (the default parallel PETSc format). This matrix will ultimately be pushed down
3186    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
3187    assembly performance the user should preallocate the matrix storage by setting
3188    the parameter nz (or the array nnz).  By setting these parameters accurately,
3189    performance during matrix assembly can be increased by more than a factor of 50.
3190 
3191    Collective
3192 
3193    Input Parameters:
3194 +  comm - MPI communicator, set to PETSC_COMM_SELF
3195 .  m - number of rows
3196 .  n - number of columns
3197 .  nz - number of nonzeros per row (same for all rows)
3198 -  nnz - array containing the number of nonzeros in the various rows
3199          (possibly different for each row) or NULL
3200 
3201    Output Parameter:
3202 .  A - the matrix
3203 
3204    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
3205    MatXXXXSetPreallocation() paradigm instead of this routine directly.
3206    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3207 
3208    Notes:
3209    If nnz is given then nz is ignored
3210 
3211    The AIJ format (also called the Yale sparse matrix format or
3212    compressed row storage), is fully compatible with standard Fortran 77
3213    storage.  That is, the stored row and column indices can begin at
3214    either one (as in Fortran) or zero.  See the users' manual for details.
3215 
3216    Specify the preallocated storage with either nz or nnz (not both).
3217    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3218    allocation.  For large problems you MUST preallocate memory or you
3219    will get TERRIBLE performance, see the users' manual chapter on matrices.
3220 
3221    By default, this format uses inodes (identical nodes) when possible, to
3222    improve numerical efficiency of matrix-vector products and solves. We
3223    search for consecutive rows with the same nonzero structure, thereby
3224    reusing matrix information to achieve increased efficiency.
3225 
3226    Level: intermediate
3227 
3228 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3229 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Create, size and type the matrix; a sequential matrix uses the same local and
     global sizes. Preallocation goes through the host SeqAIJ storage. */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3241 
/* Destroy: free the GPU-side storage, clear every method composed by this
   implementation, then run the host SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode    ierr;
  PetscInt          i;
  /* names of all function slots this implementation composes on the object */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };

  PetscFunctionBegin;
  /* plain matrices keep a Mat_SeqAIJCUSPARSE in spptr, factored ones triangular-factor data */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  for (i=0; i<(PetscInt)(sizeof(composed)/sizeof(composed[0])); i++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[i],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3264 
3265 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3266 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by copying as a host SeqAIJ matrix, then converting the copy in place
   back to the CUSPARSE type. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3276 
/* Y = Y + a*X on the GPU.
   - If X and Y use different axpy implementations (one is bound to the CPU), fall back to MatAXPY_SeqAIJ.
   - SAME_NONZERO_PATTERN (possibly detected on the fly by comparing the CSR index arrays): one cuBLAS axpy on the value arrays.
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam, writing the result in place into Y's arrays.
   - Anything else: CPU fallback via MatAXPY_SeqAIJ. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed CPU/GPU pair: Y's cached transpose becomes stale after the CPU axpy */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical index arrays */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* scale on Y: computes a*X + 1.0*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* a and b live on the host here, so switch the pointer mode for the geam calls */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit work buffer; the output reuses Y's pattern (ay, csry) in place */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode expected by the rest of this implementation */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the update is a dense axpy over the nonzero value arrays */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* general patterns: do the update on the CPU; Y's cached transpose becomes stale */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3374 
/* Y = a*Y on the GPU: a single cuBLAS scal over the nonzero value array */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  cublasStatus_t berr;
  PetscBLASInt   inc = 1, n = 1;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(handle,n,&a,vals,inc);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3396 
/* Zero all stored entries: fill the device CSR value arrays (matrix and cached
   transpose, when present) and the host value array with zeros. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij  = (Mat_SeqAIJ*)A->data;
  PetscBool      ongpu = PETSC_FALSE; /* set when the device copy was also zeroed */
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* zero the host values directly instead of going through MatZeroEntries_SeqAIJ() */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3427 
/* Bind (flg = PETSC_TRUE) or unbind (flg = PETSC_FALSE) the matrix to the CPU by
   switching the operation table and the composed methods between the host SeqAIJ
   kernels and the CUSPARSE ones. Factored matrices are not touched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (!flg) { /* run on the GPU: install the CUSPARSE implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else { /* run on the CPU: bring the data back and install the host implementations */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use  = flg; /* enable inode use only when running on the CPU */
  PetscFunctionReturn(0);
}
3478 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - converts a MATSEQAIJ matrix to MATSEQAIJCUSPARSE:
  allocates the cuSPARSE bookkeeping struct (regular or triangular-factor variant,
  depending on B->factortype), switches the default vector type to CUDA, and installs
  the GPU implementations of the matrix operations.

  The host AIJ data is kept; B->offloadmask is set to PETSC_OFFLOAD_UNALLOCATED so
  device copies are created later by the GPU paths.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat; /* for MAT_INPLACE_MATRIX, *newmat is A itself (see MatCreate_SeqAIJCUSPARSE) */

  /* vectors created to match this matrix default to the CUDA vector type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular (non-factored) matrix: allocate the cuSPARSE mult struct */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factors struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* nothing on the device yet */
  }
  /* install the GPU-aware operation table entries */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); /* installs the GPU mult/solve pointers */
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
3539 
/* Constructor for MATSEQAIJCUSPARSE: build a host SeqAIJ matrix first, then convert it
   to the CUSPARSE type in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* start from the plain host implementation ... */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* ... then retarget it to the GPU without copying */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3549 
3550 /*MC
3551    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3552 
3553    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3554    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3555    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3556 
3557    Options Database Keys:
3558 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3559 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3560 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3561 
3562   Level: beginner
3563 
3564 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3565 M*/
3566 
3567 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3568 
/* Register the cuSPARSE-based solver packages: the banded LU solver for host SeqAIJ
   matrices, and the standard cuSPARSE factorizations for seqaijcusparse matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType factors[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  int                 f;

  PetscFunctionBegin;
  /* GPU banded LU factorization, registered for host SeqAIJ matrices */
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  /* LU, Cholesky, ILU and ICC all share the same factory routine */
  for (f = 0; f < (int)(sizeof(factors)/sizeof(factors[0])); f++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factors[f],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3582 
/* Free a Mat_SeqAIJCUSPARSE struct and everything it owns: the device mult structs
   (matrix and cached transpose), the thrust work arrays, and the cuSPARSE handle.
   A NULL *cusparsestruct is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (!*cusparsestruct) PetscFunctionReturn(0); /* nothing to release */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
  /* thrust containers: deleting a NULL pointer is a no-op */
  delete (*cusparsestruct)->workVector;
  delete (*cusparsestruct)->rowoffsets_gpu;
  delete (*cusparsestruct)->cooPerm;
  delete (*cusparsestruct)->cooPerm_a;
  delete (*cusparsestruct)->csr2csc_i;
  if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3602 
/* Dispose of a CsrMatrix (values, column indices, row offsets) and reset the caller's
   pointer to NULL. A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    *mat = NULL;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
  }
  PetscFunctionReturn(0);
}
3615 
/*
  Free one triangular-factor struct (Mat_SeqAIJCUSPARSETriFactorStruct): the cuSPARSE
  matrix descriptor, the analysis info, the CSR storage, and the device/host buffers.
  A NULL *trifactor is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* host-side buffer: freed with cudaFreeHost, not cudaFree */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3635 
/*
  Free a Mat_SeqAIJCUSPARSEMultStruct used by the MatMult-type operations: the stored
  matrix (CSR, or HYB on pre-CUDA-11 builds), the matrix descriptor, the device-resident
  scalar constants and, for CUDA >= 11, the cached SpMV descriptors and buffers.
  A NULL *matstruct is a no-op; on return *matstruct is NULL.

  Fix: the return code of CsrMatrix_Destroy() was previously discarded, silently
  dropping any error it raised; it is now checked with CHKERRQ like every other call.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* fix: error code was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices; /* thrust container; delete of NULL is a no-op */
    /* device-resident scalar constants passed to the cuSPARSE calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* the (up to) three cached SpMV configurations */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3679 
/*
  Release everything held by a Mat_SeqAIJCUSPARSETriFactors without destroying the
  struct itself (the cuSPARSE handle is kept), so the factors can be rebuilt.

  Fix: the banded-factor device pointers a_band_d/i_band_d were cudaFree'd but never
  zeroed, so a second Reset (this routine is designed to be re-callable; all other
  members are zeroed after release) would cudaFree dangling pointers.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    /* lower/upper triangular factors and their transposes */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* permutation indices and work vector (thrust containers) */
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    (*trifactors)->a_band_d = NULL; /* fix: prevent double cudaFree on a subsequent Reset */
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3702 
/*
  Fully destroy a Mat_SeqAIJCUSPARSETriFactors: release the factor data via
  MatSeqAIJCUSPARSETriFactors_Reset(), destroy the cuSPARSE handle, free the struct.

  Fix: the handle assignment was hidden inside the if condition (`if (handle = ...)`),
  a classic -Wparentheses hazard; the assignment is now an explicit statement.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3719 
/* Strict weak ordering on (row,col) index pairs: by row first, then by column.
   Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 != r2) ? (r1 < r2) : (t1.get<1>() < t2.get<1>());
  }
};
3730 
/* Equality of (row,col) index pairs; used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3740 
/* 1 if the two indices differ, 0 otherwise; used with adjacent_difference to flag
   index changes in a sorted sequence. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3749 
/* Logical OR of two flags, as a PetscInt (1 if either is nonzero, else 0). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3758 
3759 #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - scatter the values v[] (given in the COO order that
  was passed to MatSetPreallocationCOO_SeqAIJCUSPARSE()) directly into the device CSR
  value array, via the permutation cusp->cooPerm computed at preallocation time.

  v may be a host or a device pointer (detected with isCudaMem()); host data is staged
  through a temporary device array. imode is INSERT_VALUES or ADD_VALUES; with v == NULL
  and INSERT_VALUES the matrix values are zeroed.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging copy of v[] when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: fall back to a plain assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD_VALUES means add to the existing matrix values */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[]; sum them before adding to the matrix */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce directly into the matrix values */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the up-to-date values now live on the device */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3841 
/* Mark the cached device transpose of A as stale; with destroy == PETSC_TRUE also free
   the transpose mult struct and the csr2csc index array. A matrix without a CUSPARSE
   struct is left untouched. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE; /* forces regeneration before the next transpose op */
  }
  PetscFunctionReturn(0);
}
3858 
3859 #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - build the CSR structure of A from a list of n
  COO entries (coo_i[], coo_j[]), on the device.

  It sorts the (i,j) pairs on the GPU, records in cusp->cooPerm the permutation from the
  user's COO order to the sorted order (used later by MatSetValuesCOO_SeqAIJCUSPARSE()),
  and, when there are duplicate (i,j) entries, records in cusp->cooPerm_a the map from
  sorted COO position to unique-nonzero index; cooPerm_a stays NULL when all entries
  are unique. The resulting a->i/a->j arrays are copied back to the host.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts nonempty rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: throw away any previous permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copies of the COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row offsets computed on device */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* replace the host CSR arrays of A with ones matching the device layout */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty matrix: fall back to a standard empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3977 
3978 /*@C
3979     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
3980 
3981    Not collective
3982 
3983     Input Parameters:
3984 +   A - the matrix
3985 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
3986 
3987     Output Parameters:
3988 +   ia - the CSR row pointers
3989 -   ja - the CSR column indices
3990 
3991     Level: developer
3992 
3993     Notes:
3994       When compressed is true, the CSR structure does not contain empty rows
3995 
3996 .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
3997 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i || !j) PetscFunctionReturn(0); /* both output pointers must be requested */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device CSR structure is current before handing out pointers */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* i and j are guaranteed non-NULL past the early return above */
  if (!compressed && a->compressedrow.use) { /* caller wants the full (uncompressed) row offsets */
    if (!cusp->rowoffsets_gpu) { /* build and cache the device copy of a->i on first use */
      cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    *i = cusp->rowoffsets_gpu->data().get();
  } else {
    *i = csr->row_offsets->data().get();
  }
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
4026 
4027 /*@C
4028     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
4029 
4030    Not collective
4031 
4032     Input Parameters:
4033 +   A - the matrix
4034 -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
4035 
4036     Output Parameters:
4037 +   ia - the CSR row pointers
4038 -   ja - the CSR column indices
4039 
4040     Level: developer
4041 
4042 .seealso: MatSeqAIJCUSPARSEGetIJ()
4043 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* invalidate the caller's copies of the device pointers; no matrix state changes
     are needed since GetIJ grants read-only access */
  if (i) { *i = NULL; }
  if (j) { *j = NULL; }
  PetscFunctionReturn(0);
}
4053 
4054 /*@C
4055    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4056 
4057    Not Collective
4058 
4059    Input Parameter:
4060 .   A - a MATSEQAIJCUSPARSE matrix
4061 
4062    Output Parameter:
4063 .   a - pointer to the device data
4064 
4065    Level: developer
4066 
4067    Notes: may trigger host-device copies if up-to-date matrix data is on host
4068 
4069 .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
4070 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* the device copy may be stale; refresh it before handing out the pointer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get(); /* raw device pointer to the CSR values */
  PetscFunctionReturn(0);
}
4089 
4090 /*@C
4091    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
4092 
4093    Not Collective
4094 
4095    Input Parameter:
4096 .   A - a MATSEQAIJCUSPARSE matrix
4097 
4098    Output Parameter:
4099 .   a - pointer to the device data
4100 
4101    Level: developer
4102 
4103 .seealso: MatSeqAIJCUSPARSEGetArrayRead()
4104 @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: just invalidate the caller's pointer, no state to update */
  *a = NULL;
  PetscFunctionReturn(0);
}
4114 
4115 /*@C
4116    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
4117 
4118    Not Collective
4119 
4120    Input Parameter:
4121 .   A - a MATSEQAIJCUSPARSE matrix
4122 
4123    Output Parameter:
4124 .   a - pointer to the device data
4125 
4126    Level: developer
4127 
4128    Notes: may trigger host-device copies if up-to-date matrix data is on host
4129 
4130 .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
4131 @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* the device copy may be stale; refresh it before handing out the pointer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusp->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get(); /* raw device pointer to the CSR values */
  /* caller may modify the device values: the GPU copy becomes the valid one and
     any cached transpose must be regenerated */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data obtained with MatSeqAIJCUSPARSEGetArray(); set to NULL on return

   Level: developer

   Notes: bumps the object state since the caller may have modified the values

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values may have been changed through the pointer */
  *a = NULL; /* zero the caller's pointer to prevent accidental use after restore */
  PetscFunctionReturn(0);
}
4179 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* note: unlike MatSeqAIJCUSPARSEGetArray(), no MatSeqAIJCUSPARSECopyToGPU() here:
     write-only access does not need current values, so the device structure must already exist */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes the authoritative one */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr); /* cached transpose values would become stale */
  PetscFunctionReturn(0);
}
4216 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data obtained with MatSeqAIJCUSPARSEGetArrayWrite(); set to NULL on return

   Level: developer

   Notes: bumps the object state since the caller has (presumably) written new values

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values have been changed through the pointer */
  *a = NULL; /* zero the caller's pointer to prevent accidental use after restore */
  PetscFunctionReturn(0);
}
4244 
/* Strict weak ordering for COO entries carried as (row, col, value, marker)
   tuples: compares lexicographically by (row, col); the value and the
   origin marker in slots 2 and 3 do not participate in the comparison. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    /* rows differ -> order by row; rows equal -> order by column */
    return (t1.get<0>() != t2.get<0>()) ? (t1.get<0>() < t2.get<0>()) : (t1.get<1>() < t2.get<1>());
  }
};
4255 
/* Unary functor that adds a fixed offset to an integer; used below to shift
   column indices and row offsets when concatenating CSR structures. */
struct Shift
{
  int _offset; /* amount added to every input value */

  Shift(int shift) : _offset(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _offset + c;
  }
};
4267 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
/*
   With MAT_INITIAL_MATRIX both the symbolic structure and the numerical values of C = [A B]
   are built on the device; the permutation Ccusp->cooPerm recording where each entry of A
   (first Annz slots) and B (remaining Bnnz slots) landed in C is saved so that with
   MAT_REUSE_MATRIX only the values need to be scattered again.
   All work happens on the GPU except for mirroring the CSR i/j arrays to the host.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* --- symbolic + numeric phase: create C and its device CSR from scratch --- */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    /* C stores all rows; no compressed-row (sparse row set) representation */
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    /* cuSPARSE matrix descriptor and the device-resident scalar constants used by SpMV */
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand the CSR row offsets of A and B into per-entry COO row indices so entries can be merged */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* Aperm/Bperm tag each entry with its origin (1 = A, 0 = B) for the permutation split below */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's column indices by A->cmap->n on the fly, since B occupies the right block of C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the (row,col,val,origin) streams of A and B into C ordered by (row,col); wPerm records origins */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift of B's column indices applied above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the positions in C by origin: A's destinations into cooPerm[0..Annz), B's into cooPerm[Annz..) */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        /* C' = [A' ; B'] stacked by rows: concatenate the transpose CSR arrays,
           shifting B's transpose row offsets by a->nz (entries of A' come first) */
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* step back so B's first offset overwrites A's trailing one */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* --- mirror the CSR structure (i/j only) on the host so C is a valid SeqAIJ matrix --- */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* fill per-row lengths and row statistics from the host row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); /* host values allocated but not filled: GPU copy is authoritative */
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* --- numeric-only phase: scatter the current values of A and B into C via the saved cooPerm --- */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* C->values[cooPerm[k]] = A->values[k] for the first Annz permutation entries */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* C->values[cooPerm[Annz+k]] = B->values[k] for the remaining entries */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transpose values are simply concatenated (structure built in the initial phase) */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* host values were never filled; device data is the valid copy */
  PetscFunctionReturn(0);
}
4571 
/* Copy selected matrix values out of the device CSR array: v[k] = A_values[idx[k]]
   for k = 0..n-1 (or the first n values when idx is NULL). The destination v may be
   host or device memory; the gather itself always runs on the device. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* decide once whether v is device-resident */
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* indexed gather: upload the indices, then permute on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n); /* device staging buffer for a host destination */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* a host destination means data moved GPU -> CPU (was incorrectly logged as CpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4611