xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 45c4b7c1946bfb8cea30c0b6b241a056476b76b5)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
86 
87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
89 
90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
92 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
93 {
94   cusparseStatus_t   stat;
95   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
96 
97   PetscFunctionBegin;
98   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
99   cusparsestruct->stream = stream;
100   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
101   PetscFunctionReturn(0);
102 }
103 
104 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
105 {
106   cusparseStatus_t   stat;
107   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
108 
109   PetscFunctionBegin;
110   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
111   if (cusparsestruct->handle != handle) {
112     if (cusparsestruct->handle) {
113       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
114     }
115     cusparsestruct->handle = handle;
116   }
117   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
118   PetscFunctionReturn(0);
119 }
120 
121 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
122 {
123   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
124   PetscBool          flg;
125   PetscErrorCode     ierr;
126 
127   PetscFunctionBegin;
128   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
129   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
130   if (cusparsestruct->handle) cusparsestruct->handle = 0;
131   PetscFunctionReturn(0);
132 }
133 
/* Query callback composed on factored matrices: reports that the solver
   package backing this factorization is MATSOLVERCUSPARSE ("cusparse"). */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/* Factory routine registered with MatSolverTypeRegister(): creates an (empty)
   MATSEQAIJCUSPARSE matrix B to hold a factorization of A of the requested
   type, and installs the appropriate symbolic-factorization callbacks and
   preferred orderings. The numeric callbacks are installed later by the
   symbolic routines. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* factortype must be set before MatSetType() so the type constructor sees a factored matrix */
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* full LU prefers nested dissection; incomplete factorizations prefer natural ordering */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* no storage yet: the symbolic phase determines the nonzero pattern */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
184 
185 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
186 {
187   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
188 
189   PetscFunctionBegin;
190   switch (op) {
191   case MAT_CUSPARSE_MULT:
192     cusparsestruct->format = format;
193     break;
194   case MAT_CUSPARSE_ALL:
195     cusparsestruct->format = format;
196     break;
197   default:
198     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
199   }
200   PetscFunctionReturn(0);
201 }
202 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229 
230 PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
231 {
232   PetscErrorCode ierr;
233 
234   PetscFunctionBegin;
235   switch (op) {
236     case MAT_FORM_EXPLICIT_TRANSPOSE:
237       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
238       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
239       A->form_explicit_transpose = flg;
240       break;
241     default:
242       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
243       break;
244   }
245   PetscFunctionReturn(0);
246 }
247 
248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
249 
250 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
251 {
252   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
253   IS             isrow = b->row,iscol = b->col;
254   PetscBool      row_identity,col_identity;
255   PetscErrorCode ierr;
256 
257   PetscFunctionBegin;
258   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
259   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
260   B->offloadmask = PETSC_OFFLOAD_CPU;
261   /* determine which version of MatSolve needs to be used. */
262   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
263   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
264   if (row_identity && col_identity) {
265     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
266     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
267     B->ops->matsolve = NULL;
268     B->ops->matsolvetranspose = NULL;
269   } else {
270     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
271     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
272     B->ops->matsolve = NULL;
273     B->ops->matsolvetranspose = NULL;
274   }
275 
276   /* get the triangular factors */
277   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
278   PetscFunctionReturn(0);
279 }
280 
/* Process -mat_cusparse_* command-line options for an unfactored
   SEQAIJCUSPARSE matrix: storage format for SpMV / all operations, and (CUDA
   11+) the cuSPARSE SpMV, SpMM, and CSR-to-CSC algorithm choices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check as above, for the SpMM enum */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check; the csr2csc list has a dummy entry 0 since cuSPARSE starts at 1 */
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
316 
317 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
318 {
319   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
320   PetscErrorCode               ierr;
321 
322   PetscFunctionBegin;
323   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
324   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
325   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
326   PetscFunctionReturn(0);
327 }
328 
329 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
330 {
331   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
332   PetscErrorCode               ierr;
333 
334   PetscFunctionBegin;
335   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
336   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
337   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
338   PetscFunctionReturn(0);
339 }
340 
341 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
342 {
343   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
344   PetscErrorCode               ierr;
345 
346   PetscFunctionBegin;
347   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
348   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
349   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
350   PetscFunctionReturn(0);
351 }
352 
353 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
354 {
355   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
356   PetscErrorCode               ierr;
357 
358   PetscFunctionBegin;
359   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
360   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
361   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
362   PetscFunctionReturn(0);
363 }
364 
/* Build (or refresh) the unit-diagonal lower-triangular factor L of an
   ILU-factored SeqAIJ matrix on the GPU. On first call the full CSR structure
   (AiLo/AjLo/AALo, assembled in pinned host memory) is created and the
   cuSPARSE triangular-solve analysis is run; on later calls only the numeric
   values are re-copied into the existing device arrays. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust assign() transfers below are fast */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;  /* implicit unit diagonal of row 0 made explicit */
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer API needs an explicit work buffer sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept (as AA_h) for the values-only update path; index buffers are freed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
511 
512 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
513 {
514   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
515   PetscInt                          n = A->rmap->n;
516   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
517   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
518   cusparseStatus_t                  stat;
519   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
520   const MatScalar                   *aa = a->a,*v;
521   PetscInt                          *AiUp, *AjUp;
522   PetscInt                          i,nz, nzUpper, offset;
523   PetscErrorCode                    ierr;
524   cudaError_t                       cerr;
525 
526   PetscFunctionBegin;
527   if (!n) PetscFunctionReturn(0);
528   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
529     try {
530       /* next, figure out the number of nonzeros in the upper triangular matrix. */
531       nzUpper = adiag[0]-adiag[n];
532       if (!upTriFactor) {
533         PetscScalar *AAUp;
534 
535         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
536 
537         /* Allocate Space for the upper triangular matrix */
538         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
539         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
540 
541         /* Fill the upper triangular matrix */
542         AiUp[0]=(PetscInt) 0;
543         AiUp[n]=nzUpper;
544         offset = nzUpper;
545         for (i=n-1; i>=0; i--) {
546           v  = aa + adiag[i+1] + 1;
547           vi = aj + adiag[i+1] + 1;
548 
549           /* number of elements NOT on the diagonal */
550           nz = adiag[i] - adiag[i+1]-1;
551 
552           /* decrement the offset */
553           offset -= (nz+1);
554 
555           /* first, set the diagonal elements */
556           AjUp[offset] = (PetscInt) i;
557           AAUp[offset] = (MatScalar)1./v[nz];
558           AiUp[i]      = AiUp[i+1] - (nz+1);
559 
560           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
561           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
562         }
563 
564         /* allocate space for the triangular factor information */
565         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
566         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
567 
568         /* Create the matrix description */
569         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
570         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
571        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
572         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
573        #else
574         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
575        #endif
576         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
577         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
578 
579         /* set the operation */
580         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
581 
582         /* set the matrix */
583         upTriFactor->csrMat = new CsrMatrix;
584         upTriFactor->csrMat->num_rows = n;
585         upTriFactor->csrMat->num_cols = n;
586         upTriFactor->csrMat->num_entries = nzUpper;
587 
588         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
589         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
590 
591         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
592         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
593 
594         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
595         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
596 
597         /* Create the solve analysis information */
598         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
599         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
600       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
601         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
602                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
603                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
604                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
605                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
606         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
607       #endif
608 
609         /* perform the solve analysis */
610         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
611                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
612                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
613                                  upTriFactor->csrMat->column_indices->data().get(),
614                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
615                                  upTriFactor->solveInfo,
616                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
617                                #else
618                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
619                                #endif
620         cerr = WaitForCUDA();CHKERRCUDA(cerr);
621         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
622 
623         /* assign the pointer */
624         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
625         upTriFactor->AA_h = AAUp;
626         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
627         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
628         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
629       } else {
630         if (!upTriFactor->AA_h) {
631           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
632         }
633         /* Fill the upper triangular matrix */
634         offset = nzUpper;
635         for (i=n-1; i>=0; i--) {
636           v  = aa + adiag[i+1] + 1;
637 
638           /* number of elements NOT on the diagonal */
639           nz = adiag[i] - adiag[i+1]-1;
640 
641           /* decrement the offset */
642           offset -= (nz+1);
643 
644           /* first, set the diagonal elements */
645           upTriFactor->AA_h[offset] = 1./v[nz];
646           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
647         }
648         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
649         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
650       }
651     } catch(char *ex) {
652       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
653     }
654   }
655   PetscFunctionReturn(0);
656 }
657 
/* Pushes both ILU triangular factors of A to the GPU and caches the row and
   column permutation index arrays on the device (only when the corresponding
   ordering is not the identity).  Marks A as offloaded on both host and device. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowPerm = a->row,colPerm = a->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* send the two triangular factors to the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* lazily allocate the device scratch array (n entries) */
  if (!cusparseTriFactors->workVector) {
    cusparseTriFactors->workVector = new THRUSTARRAY(n);
  }
  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device; skipped for the identity ordering,
     and never rebuilt once present */
  ierr = ISIdentity(rowPerm,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *indices;

    ierr = ISGetIndices(rowPerm,&indices);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(indices, indices+n);
    ierr = ISRestoreIndices(rowPerm,&indices);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* likewise for the column permutation */
  ierr = ISIdentity(colPerm,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *indices;

    ierr = ISGetIndices(colPerm,&indices);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(indices, indices+n);
    ierr = ISRestoreIndices(colPerm,&indices);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
701 
/*
   MatSeqAIJCUSPARSEBuildICCTriMatrices - builds the GPU triangular-factor
   structures used for ICC solves of A.

   A single pass over the host factor rows fills page-locked host buffers with
   a CSR image of the upper triangle (AiUp/AjUp/AAUp) plus a second value
   array AALo.  Both GPU factors share that same upper-triangular CSR pattern:
   the "up" factor uses AAUp and CUSPARSE_OPERATION_NON_TRANSPOSE, while the
   "lo" factor reuses the pattern with AALo values and is solved with
   CUSPARSE_OPERATION_TRANSPOSE (its fill mode is therefore also UPPER).

   On the first call (no factor structs yet) the full structures, cusparse
   descriptors and solve-analysis data are created; on later calls only the
   numerical values are regenerated and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;  /* pinned host CSR row offsets / column indices of the upper triangle */
  PetscScalar                       *AAUp;         /* pinned host values for the "up" (non-transpose) solve */
  PetscScalar                       *AALo;         /* pinned host values for the "lo" (transpose) solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed both as Mat_SeqAIJ (above) and Mat_SeqSBAIJ
     (below); this relies on the two structs exposing compatible i/j/a fields --
     confirm against the factor type actually stored in A */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty local matrix: nothing to build */
  /* only (re)build when the device copy is absent or older than the host copy */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first factorization: allocate space for the upper triangular pattern */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* point at row i of the host factor */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* diagonal entry first: store 1/d(i) in both value arrays
             (v[nz] is the last stored entry of the row -- presumably the diagonal) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: negated for the up solve, additionally scaled by
               1/d(i) for the transposed (lo) solve */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the upper triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix (upload pattern and values built above) */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the triangular-solve work buffer (CUDA >= 9) */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the lower triangular factor information; it shares
           the upper-triangular pattern and is solved transposed */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* fill mode stays UPPER because the stored pattern is the upper triangle;
           the transpose operation below makes it act as the lower factor */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix (same AiUp/AjUp pattern, AALo values) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query and allocate the triangular-solve work buffer (CUDA >= 9) */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist on the device: regenerate only the numerical
           values (same loop as above, pattern arrays untouched) */
        offset = 0;
        for (i=0; i<n; i++) {
          /* point at row i of the host factor */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* diagonal first: 1/d(i) in both value arrays */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
921 
/*
   MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - pushes the ICC triangular
   factors of A to the GPU and caches the row permutation and its inverse on
   the device when the ordering is not the identity.

   Fix: the permutation branch now also checks rpermIndices, matching the ILU
   analogue; without that guard a repeated numeric factorization re-allocated
   the device index arrays and leaked the previous ones.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* lazily allocate the device scratch array (n entries) */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* nnz of both factors: the shared upper pattern counted twice, diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: cache ip (rows) and its inverse (cols) on the
     device; the rpermIndices check prevents leaking the arrays when the
     numeric factorization is repeated (consistent with the ILU path) */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
959 
/* Numeric Cholesky factorization: factor on the CPU via the SeqAIJ kernel,
   select the solve callbacks according to the ordering, then upload the
   triangular factors to the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bAIJ = (Mat_SeqAIJ*)B->data;
  IS             rowPerm = bAIJ->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current, then factor on the CPU */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the *_NaturalOrdering
     variants apply when the row permutation is the identity */
  ierr = ISIdentity(rowPerm,&natural);CHKERRQ(ierr);
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no mat-solve variants are provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
989 
/*
   MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - materializes CSC (i.e.
   transposed CSR) copies of both triangular factors, runs the cusparse solve
   analysis on them, and stores the results in loTriFactorPtrTranspose /
   upTriFactorPtrTranspose of A->spptr.

   Since the transpose is explicit, both transposed factors are solved with
   CUSPARSE_OPERATION_NON_TRANSPOSE and with the fill mode flipped relative
   to the original factor; index base and diag type are copied unchanged.

   Fix: the two profiling calls after the csr2csc conversions were
   PetscLogEventBegin instead of PetscLogEventEnd, so the
   MAT_CUSPARSEGenerateTranspose event begun before each conversion was never
   closed (unbalanced begin/end corrupts the -log_view event nesting).
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the matrix descriptors of the lower triangular factor, flipping the
     fill mode because the explicit transpose swaps the triangle */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is explicit, so no transposed solve */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 requires an explicit work buffer for csr2csc */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this was PetscLogEventBegin, leaving the event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  /* query and allocate the triangular-solve work buffer (CUDA >= 9) */
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the matrix descriptors of the upper triangular factor, again with
     the fill mode flipped */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* bug fix: this was PetscLogEventBegin, leaving the event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1195 
/* Unary functor converting a PetscScalar to a PetscInt by truncating its
   real part.  Used with thrust::transform when integer index data (e.g. a
   permutation recovered from csr2csc) has been carried in scalar storage. */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar v)
  {
    return (PetscInt)PetscRealPart(v);
  }
};
1204 
/* Form (or refresh) the explicit transpose of A on the GPU so that transpose
   multiplies can run as ordinary SpMV on the transposed data.  For the CSR
   format the transposed sparsity pattern is computed once with csr2csc and
   cached as a permutation (cusparsestruct->csr2csc_i); subsequent calls only
   permute the numerical values, which is much cheaper.  No-op when the cached
   transpose is already valid (A->transupdated) or A is empty. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* nothing to do for empty matrices or when no explicit transpose was requested */
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the device copy of A is current */
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* cached transpose is still valid */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* the value-permutation shortcut below only works for CSR; other formats rebuild from scratch */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta (device-resident scalars used by the SpMV calls) */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate CSR storage for A^T: cmap->n rows, rmap->n cols, same nnz as A */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload A's row offsets (a->i) so csr2csc below can read them on device */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* pre-CUDA-11 path: HYB has no direct transpose, so round-trip HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT keeps A's dimensions (rmap x cmap) instead of the
         swapped transpose dimensions, and its row_offsets are sized with
         A->rmap->n+1; this looks correct only for square matrices — confirm
         whether this deprecated HYB path assumes square A. */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* one-time: run csr2csc on the identity sequence 0,1,2,... so the transposed
         "values" come out as the permutation taking CSR order to CSC order */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA-11 csr2cscEx2 requires an explicitly sized external work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* convert the scalar-encoded permutation to integers and cache it */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* fast path: gather A's values through the cached permutation into A^T's values */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1413 
1414 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b using the cached transposed triangular factors, with row
   (rperm) and column (cperm) permutations from the factorization ordering.
   Pipeline: x <- P_r b; solve U^T; solve L^T; x <- P_c x (via a temporary,
   since the final permutation cannot be done in place). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U */
  /* NOTE: for the transposed system the factor order is reversed: U^T first, then L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1502 
/* Solve A^T x = b when the factorization used the natural ordering, so no
   row/column permutations are needed: solve U^T into a work vector, then
   solve L^T into x. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U */
  /* for the transposed system the factor order is reversed: U^T before L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1572 
/* Solve A x = b with the cached L and U triangular factors, applying the
   row permutation to b on entry and the column permutation to the result on
   exit: temp <- P_r b; solve L; solve U; x <- P_c temp. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1648 
/* Solve A x = b when the factorization used the natural ordering, so no
   row/column permutations are needed: solve L into a work vector, then
   solve U into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1710 
/* Copy the latest matrix values from the GPU back into the host AIJ storage.
   Only the values array is transferred (the nonzero pattern on the host is
   reused as-is); on success both copies are valid (PETSC_OFFLOAD_BOTH).
   A no-op unless the device currently holds the only up-to-date copy. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij            = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cuerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusparsestruct->mat->mat;

    ierr  = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cuerr = cudaMemcpy(aij->a,csr->values->data().get(),aij->nz*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cuerr);
    cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
    ierr  = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr  = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1731 
/* MatSeqAIJGetArray() implementation for SEQAIJCUSPARSE: sync the values to
   the host and hand back the host array. Since the caller may modify those
   values, the device copy is marked stale (PETSC_OFFLOAD_CPU). */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1743 
/*
   MatSeqAIJCUSPARSECopyToGPU - Mirrors the host AIJ matrix onto the GPU.

   Fast path: when the nonzero pattern is unchanged (A->nonzerostate matches
   the cached nonzerostate) and the storage format is CSR, only the values
   array is re-uploaded. Otherwise the whole device representation is
   destroyed and rebuilt, honoring compressed-row storage and the requested
   cuSPARSE format (CSR always; ELL/HYB only before CUDA 11).
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when host values are absent */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed: any cached transpose values are stale (its structure is kept) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern (or format) changed: tear down the old device data entirely */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed-row storage only the rows containing nonzeros are represented */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values: mirror the structure only and do not mark PETSC_OFFLOAD_BOTH below */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident copies of 1 and 0, used with CUSPARSE_POINTER_MODE_DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage a temporary CSR copy on the device, convert it to HYB/ELL, then free it */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL requests a max-width partition; HYB lets cusparse pick the split */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the staging CSR copy is no longer needed */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1900 
/* Thrust functor applied to zipped (source,destination) pairs:
   destination += source. */
struct VecCUDAPlusEquals
{
  template <typename ZippedPair>
  __host__ __device__
  void operator()(ZippedPair p)
  {
    thrust::get<1>(p) = thrust::get<0>(p) + thrust::get<1>(p);
  }
};
1910 
/* Thrust functor applied to zipped (source,destination) pairs:
   destination = source. */
struct VecCUDAEquals
{
  template <typename ZippedPair>
  __host__ __device__
  void operator()(ZippedPair p)
  {
    thrust::get<1>(p) = thrust::get<0>(p);
  }
};
1920 
/* Thrust functor applied to zipped pairs with the roles reversed:
   first element = second element. */
struct VecCUDAEqualsReverse
{
  template <typename ZippedPair>
  __host__ __device__
  void operator()(ZippedPair p)
  {
    thrust::get<0>(p) = thrust::get<1>(p);
  }
};
1930 
/* Scratch data attached to C->product for the sparse-times-dense and
   sparse-times-sparse products implemented with cuSPARSE below. */
struct MatMatCusparse {
  PetscBool             cisdense;  /* if true, convert the result C back to MATSEQDENSE (CPU) after the numeric phase */
  PetscScalar           *Bt;       /* device buffer for an explicit B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;         /* intermediate dense matrix holding the sparse-times-dense product for PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count logged at each numeric phase */
  CsrMatrix             *Bcsr;    /* substitute CSR form of B when B is stored in compressed row format */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor for B (SpMM path) */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor for C or X (SpMM path) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* current size of mmBuffer */
  void                  *mmBuffer;    /* SpMM/SpGEMM workspace */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1950 
/* Destructor for the MatMatCusparse scratch data stored in C->product->data:
   releases device buffers, cuSPARSE descriptors, the intermediate matrix X,
   and finally the structure itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so no guard needed */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) {stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);}
  if (mm->mmBuffer)    {cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);}
  if (mm->mmBuffer2)   {cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);}
  if (mm->matBDescr)   {stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);}
  if (mm->matCDescr)   {stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);}
  if (mm->spgemmDesc)  {stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);}
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1975 
1976 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1977 
/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - Numeric phase of
   C = op(A)*op(B) (and the PtAP/RARt variants) for sparse A and dense B.

   For PtAP/RARt the sparse-times-dense product is first written into the
   intermediate dense matrix mmdata->X and then finished with a dense-dense
   multiply against B. On CUDA >= 11 cusparseSpMM is used (it supports
   op(B)); earlier versions use csrmm, transposing B explicitly with
   cublasXgeam when op(B) = B^T is required.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) and the result dimensions m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use an explicitly stored transpose of A instead of op(A) = A^T */
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-times-dense result into the scratch matrix X */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }
    /* NOTE(review): if only Clda changed, an existing matBDescr keeps its previous
       barray pointer here -- presumably barray is unchanged in that case; verify */

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when needed; it is never shrunk */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly into mmdata->Bt with an out-of-place geam transpose */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt by multiplying the intermediate X with B
     (the PetscBool argument distinguishes the two cases) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* if the caller supplied a CPU dense C, convert the result back in place */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2150 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - Symbolic phase for
   sparse-times-dense products: sets the size and type of C, allocates the
   MatMatCusparse product data (the B^T buffer for pre-CUDA-11 ABt/RARt and
   the intermediate dense matrix X for PtAP/RARt), and installs the numeric
   callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2224 
/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - Numeric phase of the
   sparse-sparse product C = op(A)*op(B), computed entirely on the GPU.

   If mmdata->reusesym is set, the values were already computed during the
   symbolic phase (cuSPARSE has no split symbolic/numeric SpGEMM) and only
   the assembly bookkeeping at the end runs. On CUDA >= 11 the product uses
   cusparseSpGEMM_compute/_copy; earlier versions use csrgemm. Transposed
   operands are realized through explicitly stored transposes, so the
   kernels are always called with NON_TRANSPOSE.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  /* empty product: nothing to compute, just finish the assembly bookkeeping */
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* for a symmetric operand, op(X) = X, so fall back to the plain AB path */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* compute fills the SpGEMM workspace; copy moves the result into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2339 
/* Symbolic phase of the sparse-sparse products C = A*B, C = A^T*B and C = A*B^T with
   both operands of type SEQAIJCUSPARSE.

   Builds the CSR structure of C on the GPU (with cusparseSpGEMM on CUDA >= 11.0, or
   the legacy csrgemm API otherwise; note the legacy API cannot do a symbolic-only
   pass, so values are computed too), then copies the row offsets and column indices
   back to the host so the SeqAIJ (CPU) representation of C is consistent.
   Transposed operands are handled by forming the explicit transpose on the GPU. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* for a symmetric operand A^T*B == A*B (resp. A*B^T == A*B), so avoid forming the explicit transpose */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick operand mult structs and the result dimensions m x n (inner dimension k) per product type */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    /* C inherits A's nonzero row pattern: copy the row index map to both host and device */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalars used with CUSPARSE_POINTER_MODE_DEVICE below */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: all-zero row offsets, no entries; skip the GPU SpGEMM entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B with full (uncompressed) row offsets; values/columns are shared with B */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* 2 flops per (A entry, matching B-row entry) pair */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C descriptor starts with 0 nnz and NULL arrays; cusparseSpGEMM fills in the size, and
     pointers are attached after allocation via cusparseCsrSetPointers() below */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  /* legacy API: first count nnz(C) (host pointer mode so cnz lands on the host) ... */
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU CSR structure into the host SeqAIJ representation (i, j, ilen, imax, ...) */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* c->i gets the full m+1 offsets; rows absent from rindex are empty (repeated offset) */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics from the expanded offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2663 
2664 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2665 
/* handles sparse or dense B: selects the symbolic-product implementation for mat = op(A,B[,C])
   when A is SEQAIJCUSPARSE.  Operands bound to the CPU fall back to the host SeqAIJ paths. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      B_is_dense = PETSC_FALSE,B_is_cusp = PETSC_FALSE,C_is_cusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&B_is_dense);CHKERRQ(ierr);
  /* B counts as cusparse only when neither A nor B is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&B_is_cusp);CHKERRQ(ierr);
  }
  /* the third operand only matters for ABC */
  if (product->type == MATPRODUCT_ABC) {
    C_is_cusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&C_is_cusp);CHKERRQ(ierr);
    }
  }
  if (B_is_dense) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt || t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave productsymbolic unset */
  } else if (B_is_cusp && C_is_cusp) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt || t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave productsymbolic unset */
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2724 
/* yy = A*xx: thin wrapper over the common kernel (no add vector, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2733 
/* zz = A*xx + yy: thin wrapper over the common kernel (add vector yy, no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2742 
/* yy = A^H*xx: thin wrapper over the common kernel (no add vector; trans && herm selects the conjugate transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2751 
/* zz = A^H*xx + yy: thin wrapper over the common kernel (add vector yy; trans && herm selects the conjugate transpose) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2760 
/* yy = A^T*xx: thin wrapper over the common kernel (no add vector; trans && !herm selects the plain transpose) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2769 
/* y[idx[i]] += x[i] for i in [0,n): scatter-add the packed vector x into y at the
   positions given by idx.
   NOTE(review): presumably the entries of idx are distinct (compressed-row indices) --
   duplicate indices would race on y[] since no atomics are used; confirm at call sites.
   Grid-stride loop: correct for any 1-D launch configuration (the previous version
   silently skipped tail elements if gridDim.x*blockDim.x < n).  The index is PetscInt
   to match n, avoiding truncation with 64-bit PETSc indices. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x*blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
2775 
2776 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2777 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2778 {
2779   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2780   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2781   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2782   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2783   PetscErrorCode               ierr;
2784   cusparseStatus_t             stat;
2785   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2786   PetscBool                    compressed;
2787 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2788   PetscInt                     nx,ny;
2789 #endif
2790 
2791   PetscFunctionBegin;
2792   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
2793   if (!a->nonzerorowcnt) {
2794     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2795     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2796     PetscFunctionReturn(0);
2797   }
2798   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2799   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2800   if (!trans) {
2801     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2802     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2803   } else {
2804     if (herm || !A->form_explicit_transpose) {
2805       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2806       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2807     } else {
2808       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2809       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2810     }
2811   }
2812   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2813   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2814 
2815   try {
2816     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2817     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2818     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2819 
2820     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2821     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2822       /* z = A x + beta y.
2823          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2824          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2825       */
2826       xptr = xarray;
2827       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2828       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2829      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2830       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2831           allocated to accommodate different uses. So we get the length info directly from mat.
2832        */
2833       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2834         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2835         nx = mat->num_cols;
2836         ny = mat->num_rows;
2837       }
2838      #endif
2839     } else {
2840       /* z = A^T x + beta y
2841          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2842          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2843        */
2844       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2845       dptr = zarray;
2846       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2847       if (compressed) { /* Scatter x to work vector */
2848         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2849         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2850                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2851                          VecCUDAEqualsReverse());
2852       }
2853      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2854       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2855         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2856         nx = mat->num_rows;
2857         ny = mat->num_cols;
2858       }
2859      #endif
2860     }
2861 
2862     /* csr_spmv does y = alpha op(A) x + beta y */
2863     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2864      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2865       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2866       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2867         cudaError_t cerr;
2868         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2869         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2870         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2871                                 matstruct->matDescr,
2872                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2873                                 matstruct->cuSpMV[opA].vecYDescr,
2874                                 cusparse_scalartype,
2875                                 cusparsestruct->spmvAlg,
2876                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2877         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2878 
2879         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2880       } else {
2881         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2882         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2883         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2884       }
2885 
2886       stat = cusparseSpMV(cusparsestruct->handle, opA,
2887                                matstruct->alpha_one,
2888                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2889                                matstruct->cuSpMV[opA].vecXDescr,
2890                                beta,
2891                                matstruct->cuSpMV[opA].vecYDescr,
2892                                cusparse_scalartype,
2893                                cusparsestruct->spmvAlg,
2894                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2895      #else
2896       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2897       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2898                                mat->num_rows, mat->num_cols,
2899                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2900                                mat->values->data().get(), mat->row_offsets->data().get(),
2901                                mat->column_indices->data().get(), xptr, beta,
2902                                dptr);CHKERRCUSPARSE(stat);
2903      #endif
2904     } else {
2905       if (cusparsestruct->nrows) {
2906        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2907         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2908        #else
2909         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2910         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2911                                  matstruct->alpha_one, matstruct->descr, hybMat,
2912                                  xptr, beta,
2913                                  dptr);CHKERRCUSPARSE(stat);
2914        #endif
2915       }
2916     }
2917     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2918 
2919     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2920       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2921         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2922           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2923         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2924           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2925         }
2926       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2927         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2928       }
2929 
2930       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2931       if (compressed) {
2932         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2933         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2934            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2935            prevent that. So I just add a ScatterAdd kernel.
2936          */
2937        #if 0
2938         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2939         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2940                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2941                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2942                          VecCUDAPlusEquals());
2943        #else
2944         PetscInt n = matstruct->cprowIndices->size();
2945         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2946        #endif
2947         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2948       }
2949     } else {
2950       if (yy && yy != zz) {
2951         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2952       }
2953     }
2954     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2955     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2956     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2957   } catch(char *ex) {
2958     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2959   }
2960   if (yy) {
2961     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2962   } else {
2963     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2964   }
2965   PetscFunctionReturn(0);
2966 }
2967 
/* zz = A^T*xx + yy. Thin wrapper forwarding to the shared mult-add kernel; the PETSC_TRUE
   flag requests the transpose product (the second flag presumably selects the Hermitian
   variant -- confirm against MatMultAddKernel_SeqAIJCUSPARSE's signature). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2976 
/* Finish assembly via the host SeqAIJ path, then drop the cached device matrix if the
   nonzero pattern changed during assembly (a stale device mirror cannot be reused). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscObjectState   state_before = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (cusp->deviceMat && state_before != A->nonzerostate) {
    cudaError_t cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
2994 
2995 /* --------------------------------------------------------------------------------*/
2996 /*@
2997    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2999    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3000    assembly performance the user should preallocate the matrix storage by setting
3001    the parameter nz (or the array nnz).  By setting these parameters accurately,
3002    performance during matrix assembly can be increased by more than a factor of 50.
3003 
3004    Collective
3005 
3006    Input Parameters:
3007 +  comm - MPI communicator, set to PETSC_COMM_SELF
3008 .  m - number of rows
3009 .  n - number of columns
3010 .  nz - number of nonzeros per row (same for all rows)
3011 -  nnz - array containing the number of nonzeros in the various rows
3012          (possibly different for each row) or NULL
3013 
3014    Output Parameter:
3015 .  A - the matrix
3016 
3017    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3019    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3020 
3021    Notes:
3022    If nnz is given then nz is ignored
3023 
3024    The AIJ format (also called the Yale sparse matrix format or
3025    compressed row storage), is fully compatible with standard Fortran 77
3026    storage.  That is, the stored row and column indices can begin at
3027    either one (as in Fortran) or zero.  See the users' manual for details.
3028 
3029    Specify the preallocated storage with either nz or nnz (not both).
3030    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3031    allocation.  For large problems you MUST preallocate memory or you
3032    will get TERRIBLE performance, see the users' manual chapter on matrices.
3033 
3034    By default, this format uses inodes (identical nodes) when possible, to
3035    improve numerical efficiency of matrix-vector products and solves. We
3036    search for consecutive rows with the same nonzero structure, thereby
3037    reusing matrix information to achieve increased efficiency.
3038 
3039    Level: intermediate
3040 
3041 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3042 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate the host CSR storage directly through the SeqAIJ routine; the CUSPARSE
     type shares the SeqAIJ host representation and mirrors it to the GPU on demand */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3054 
/* Destroy the GPU-side data (plain or factored), detach every composed method, then
   fall through to the SeqAIJ destructor for the host data. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  size_t         i;
  const char     *composed[] = {"MatSeqAIJCopySubArray_C",
                                "MatCUSPARSESetFormat_C",
                                "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
                                "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
                                "MatFactorGetSolverType_C",
                                "MatSetPreallocationCOO_C",
                                "MatSetValuesCOO_C"};

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  for (i=0; i<sizeof(composed)/sizeof(composed[0]); i++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[i],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3076 
3077 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3078 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate via the host SeqAIJ path, then convert the new matrix in place back to the
   CUSPARSE type so it gets fresh device-side structures. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3088 
/* Y = a*X + Y on the GPU. Dispatches on the nonzero-pattern relation:
   - patterns proven identical (or given as SAME_NONZERO_PATTERN): a single cublas axpy
     over the two value arrays;
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam with the output using Y's pattern;
   - otherwise: fall back to the host MatAXPY_SeqAIJ. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices do not share the GPU implementation (one is bound to CPU): host path */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nz count and identical row/column
     index arrays (checked on the device) mean the value arrays are element-aligned */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in the geam: Y = a*X + 1*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* the scalars a and b live on the host, so switch pointer mode for these calls */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 geam requires an explicit work buffer sized by a first query call */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* result is written in place into Y's value array (output uses Y's pattern) */
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode expected by the rest of this file */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    /* identical patterns: treat the value arrays as flat vectors and axpy them */
    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* DIFFERENT_NONZERO_PATTERN (or spgeam-unsafe shape): do the work on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3186 
/* Y = a*Y: scale the nonzero value array in place on the GPU with a single cublas scal */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *aval;
  cublasHandle_t handle;
  cublasStatus_t berr;
  PetscBLASInt   inc = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&aval);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(handle,bnz,&a,aval,inc);CHKERRCUBLAS(berr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&aval);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3208 
/* Zero all stored values of A (the nonzero pattern is kept). Device copies of the matrix
   and of its cached transpose are zeroed with thrust; the host array is zeroed as well,
   and the offload mask records whether both copies are now valid. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      zeroed_on_device = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;

      if (csr->values) {
        zeroed_on_device = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;

      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the host values too, mirroring what MatZeroEntries_SeqAIJ would do */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = zeroed_on_device ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3239 
/* Switch the matrix's operation table between host (flg=PETSC_TRUE) and device
   implementations, and compose/uncompose the matching plugin methods.
   Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* bring the host copy up to date before all work is routed to the CPU */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* remove the GPU-only composed methods so queries fall back to defaults */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inodes are only enabled while the matrix runs on the CPU */
  PetscFunctionReturn(0);
}
3290 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE. For MAT_INITIAL_MATRIX the input is duplicated,
   for MAT_REUSE_MATRIX its values are copied into *newmat, and for MAT_INPLACE_MATRIX the
   input matrix itself is retargeted. Allocates the cusparse context (handle, stream, default
   algorithms) on first conversion and installs the device operation table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is A itself */
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: full cusparse context with format and algorithm choices */
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrix: only the triangular-factor context is needed */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* start out bound to the GPU; this installs the device mult/scale/... operations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3344 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then convert
   it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3354 
3355 /*MC
3356    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3357 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3359    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3360    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3361 
3362    Options Database Keys:
3363 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3364 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3365 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3366 
3367   Level: beginner
3368 
3369 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3370 M*/
3371 
3372 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3373 
/* Register the cusparse-based solver back ends with the MatSolverType table */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  /* the banded LU factorization is registered for plain SEQAIJ matrices */
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  /* the general cusparse solver supports all four factorization kinds on SEQAIJCUSPARSE */
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3387 
/* Free everything owned by a Mat_SeqAIJCUSPARSE context (device matrices, work arrays,
   COO permutation data, the cusparse handle) and the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (cusp) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    delete cusp->csr2csc_i;
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3407 
/* Free the three device arrays of a CsrMatrix and the struct itself; *mat is reset to NULL */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3420 
/* Destroy one triangular-factor structure: its cusparse matrix descriptor, the solve
   analysis info, the CSR storage, and the work buffers used for the solve and
   (CUDA >= 11) the csr2csc conversion. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is page-locked host memory, hence cudaFreeHost rather than cudaFree */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3440 
/* Destroy a Mat_SeqAIJCUSPARSEMultStruct (the device-side SpMV data): the CSR (or legacy
   ELL/HYB) matrix, the cusparse descriptors, the device scalar constants, and
   (CUDA >= 11) the cached SpMV vector descriptors and work buffers.
   *matstruct is reset to NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* was previously called with its return code ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalar constants */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation (non-transpose/transpose/Hermitian) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3484 
/* Release all factorization data held by the trifactors context while leaving the context
   itself allocated (so it can be refilled, e.g. on re-factorization). Every freed pointer
   is reset to NULL so the routine is safe to call more than once. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
    /* NULL the band pointers too; previously they were left dangling, so a second Reset
       (or a Reset followed by Destroy) would cudaFree already-freed memory */
    (*trifactors)->a_band_d = NULL;
    (*trifactors)->i_band_d = NULL;
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3507 
/* MatSeqAIJCUSPARSETriFactors_Destroy - fully destroy the triangular-factor
   container: release all GPU resources via MatSeqAIJCUSPARSETriFactors_Reset(),
   destroy the private cusparse handle (if one was created), and free the
   container (PetscFree() sets *trifactors to NULL). */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* assignment hoisted out of the if-condition: `if (handle = ...)` is the
       classic -Wparentheses hazard and reads like an intended comparison */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
      (*trifactors)->handle = NULL;
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3524 
/* Strict-weak ordering on (i,j) index pairs: lexicographic, row major.
   Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;      /* distinct rows: row decides */
    return t1.get<1>() < t2.get<1>();  /* same row: column decides */
  }
};
3535 
/* Equality predicate on (i,j) index pairs: true iff both components match.
   Used to detect duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3545 
/* Binary op yielding 1 when the two values differ and 0 when they are equal;
   fed to adjacent_difference to flag positions where a sorted index changes. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3554 
/* Logical-or of two flags, returned as a 0/1 PetscInt; combines the row-change
   and column-change indicators produced with IJDiff. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3563 
3564 #include <thrust/iterator/discard_iterator.h>
/* MatSetValuesCOO_SeqAIJCUSPARSE - insert/add a full set of values into the GPU CSR
   matrix, using the permutation previously computed by
   MatSetPreallocationCOO_SeqAIJCUSPARSE().

   Input:
     A     - the matrix
     v     - values, in the same order as the coo_i[]/coo_j[] given at preallocation
             time; may be host or device memory.  NULL zeroes the matrix when imode
             is INSERT_VALUES.
     imode - INSERT_VALUES or ADD_VALUES */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* device staging copy of v when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation info available: just run a regular (final) assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* NULL values: only meaningful action is zeroing on insert */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* v lives on the host: copy it to the device before scattering */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* repeated COO entries: sum duplicates into a work array, then add to the CSR values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* cooPerm_a holds, for each sorted COO entry, the index of its unique CSR slot;
         reduce_by_key collapses runs of equal keys (i.e. duplicate (i,j) pairs) */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* no duplicates: permute v into CSR order and add element-wise */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* no duplicates: permute v into CSR order and overwrite element-wise */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  /* values were written on the device only */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3641 
/* MatSeqAIJCUSPARSEInvalidateTranspose - mark the cached explicit transpose of A as
   out of date; when destroy is PETSC_TRUE also free the transpose structure and the
   csr2csc permutation.  A no-op when no CUSPARSE structure is attached yet. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->csr2csc_i;
      cusparsestruct->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3658 
3659 #include <thrust/binary_search.h>
/* MatSetPreallocationCOO_SeqAIJCUSPARSE - build the CSR nonzero structure of A from a
   list of n (coo_i[],coo_j[]) coordinates (duplicates allowed).

   The sorting permutation is kept on the GPU in cusp->cooPerm so that
   MatSetValuesCOO_SeqAIJCUSPARSE() can later scatter values into CSR order; when the
   coordinate list contains duplicates, cusp->cooPerm_a additionally maps each sorted
   coordinate to the index of its unique CSR slot (so duplicates can be reduced). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts nonzero rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* size changed: discard any previously cached permutations */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);           /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);           /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n);   /* per-row entry counts (as prefix sums) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) pairs into CSR order, tracking the permutation in cooPerm */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* keep sorted rows (cooPerm_a) and columns (w) to detect duplicates below */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    /* compress out duplicate (i,j) pairs; nekey marks the new end */
    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* duplicates present: build, for every sorted entry, the index of its unique slot.
                Flag (with IJDiff) positions where the row or the column changes w.r.t. the
                previous sorted entry, OR the two flags (IJSum), then an inclusive scan turns
                the 0/1 flags into the running unique-slot index. */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0; /* adjacent_difference copies the first element; first slot index is 0 */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of unique entries with row index <= r (i.e. the CSR row pointer, shifted by one) */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host-side CSR arrays of the Mat_SeqAIJ from the device data */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths, nonzero-row count and max row length */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty coordinate list: standard empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* values are not set yet: the matrix is not assembled until MatSetValuesCOO */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3758 
/* MatSeqAIJCUSPARSEGetArrayRead - return a read-only device pointer to the CSR values
   of A, copying the matrix to the GPU first if needed.  Pair with
   MatSeqAIJCUSPARSERestoreArrayRead(); the offload mask is not changed. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out the pointer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
3777 
/* MatSeqAIJCUSPARSERestoreArrayRead - companion to MatSeqAIJCUSPARSEGetArrayRead().
   Read-only access cannot change the values, so no state bump is needed; the
   caller's pointer is simply invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the borrowed pointer */
  PetscFunctionReturn(0);
}
3787 
/* MatSeqAIJCUSPARSEGetArray - return a read/write device pointer to the CSR values of A,
   copying the matrix to the GPU first if needed.  Marks the GPU copy as the valid one
   and invalidates the cached transpose (values may change).  Pair with
   MatSeqAIJCUSPARSERestoreArray(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out the pointer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write through the pointer */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3808 
/* MatSeqAIJCUSPARSERestoreArray - companion to MatSeqAIJCUSPARSEGetArray().
   The caller may have modified the values, so the object state is increased
   before the borrowed pointer is invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the borrowed pointer */
  PetscFunctionReturn(0);
}
3821 
/* MatSeqAIJCUSPARSEGetArrayWrite - return a write-only device pointer to the CSR values
   of A.  Unlike MatSeqAIJCUSPARSEGetArray() this does NOT copy current values to the
   GPU (they will be overwritten); it marks the GPU copy as valid and invalidates the
   cached transpose.  Pair with MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes the valid one */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3841 
/* MatSeqAIJCUSPARSERestoreArrayWrite - companion to MatSeqAIJCUSPARSEGetArrayWrite().
   The values were (presumably) rewritten, so the object state is increased
   before the borrowed pointer is invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the borrowed pointer */
  PetscFunctionReturn(0);
}
3854 
/* Strict-weak ordering on (row, col, value, flag) tuples: lexicographic on the
   (row, col) pair only; the value and flag fields do not affect the order. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;      /* distinct rows: row decides */
    return t1.get<1>() < t2.get<1>();  /* same row: column decides */
  }
};
3865 
/* Unary functor adding a fixed offset to each value; used e.g. to shift the
   column indices of B by the number of columns of A when merging matrices. */
struct Shift
{
  int _shift; /* the constant offset */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + _shift;
  }
};
3877 
/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3879 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3880 {
3881   PetscErrorCode               ierr;
3882   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3883   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3884   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3885   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3886   PetscInt                     Annz,Bnnz;
3887   cusparseStatus_t             stat;
3888   PetscInt                     i,m,n,zero = 0;
3889   cudaError_t                  cerr;
3890 
3891   PetscFunctionBegin;
3892   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3893   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3894   PetscValidPointer(C,4);
3895   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3896   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3897   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3898   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3899   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3900   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3901   if (reuse == MAT_INITIAL_MATRIX) {
3902     m     = A->rmap->n;
3903     n     = A->cmap->n + B->cmap->n;
3904     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3905     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3906     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3907     c     = (Mat_SeqAIJ*)(*C)->data;
3908     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3909     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3910     Ccsr  = new CsrMatrix;
3911     Cmat->cprowIndices      = NULL;
3912     c->compressedrow.use    = PETSC_FALSE;
3913     c->compressedrow.nrows  = 0;
3914     c->compressedrow.i      = NULL;
3915     c->compressedrow.rindex = NULL;
3916     Ccusp->workVector       = NULL;
3917     Ccusp->nrows    = m;
3918     Ccusp->mat      = Cmat;
3919     Ccusp->mat->mat = Ccsr;
3920     Ccsr->num_rows  = m;
3921     Ccsr->num_cols  = n;
3922     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3923     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3924     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3925     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3926     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3927     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3928     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3929     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3930     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3931     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3932     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3933     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
3934     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3935     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3936     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3937 
3938     Acsr = (CsrMatrix*)Acusp->mat->mat;
3939     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3940     Annz = (PetscInt)Acsr->column_indices->size();
3941     Bnnz = (PetscInt)Bcsr->column_indices->size();
3942     c->nz = Annz + Bnnz;
3943     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3944     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3945     Ccsr->values = new THRUSTARRAY(c->nz);
3946     Ccsr->num_entries = c->nz;
3947     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3948     if (c->nz) {
3949       auto Acoo = new THRUSTINTARRAY32(Annz);
3950       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3951       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3952       THRUSTINTARRAY32 *Aroff,*Broff;
3953 
3954       if (a->compressedrow.use) { /* need full row offset */
3955         if (!Acusp->rowoffsets_gpu) {
3956           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3957           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3958           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3959         }
3960         Aroff = Acusp->rowoffsets_gpu;
3961       } else Aroff = Acsr->row_offsets;
3962       if (b->compressedrow.use) { /* need full row offset */
3963         if (!Bcusp->rowoffsets_gpu) {
3964           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3965           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3966           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3967         }
3968         Broff = Bcusp->rowoffsets_gpu;
3969       } else Broff = Bcsr->row_offsets;
3970       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3971       stat = cusparseXcsr2coo(Acusp->handle,
3972                               Aroff->data().get(),
3973                               Annz,
3974                               m,
3975                               Acoo->data().get(),
3976                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3977       stat = cusparseXcsr2coo(Bcusp->handle,
3978                               Broff->data().get(),
3979                               Bnnz,
3980                               m,
3981                               Bcoo->data().get(),
3982                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3983       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3984       auto Aperm = thrust::make_constant_iterator(1);
3985       auto Bperm = thrust::make_constant_iterator(0);
3986 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3987       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3988       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
3989 #else
3990       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
3991       auto Bcib = Bcsr->column_indices->begin();
3992       auto Bcie = Bcsr->column_indices->end();
3993       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
3994 #endif
3995       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
3996       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
3997       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
3998       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
3999       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4000       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4001       auto p1 = Ccusp->cooPerm->begin();
4002       auto p2 = Ccusp->cooPerm->begin();
4003       thrust::advance(p2,Annz);
4004       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4005 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4006       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4007 #endif
4008       auto cci = thrust::make_counting_iterator(zero);
4009       auto cce = thrust::make_counting_iterator(c->nz);
4010 #if 0 //Errors on SUMMIT cuda 11.1.0
4011       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4012 #else
4013       auto pred = thrust::identity<int>();
4014       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4015       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4016 #endif
4017       stat = cusparseXcoo2csr(Ccusp->handle,
4018                               Ccoo->data().get(),
4019                               c->nz,
4020                               m,
4021                               Ccsr->row_offsets->data().get(),
4022                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4023       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4024       delete wPerm;
4025       delete Acoo;
4026       delete Bcoo;
4027       delete Ccoo;
4028 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4029       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4030                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4031                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4032                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4033 #endif
4034       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4035         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4036         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4037         CsrMatrix *CcsrT = new CsrMatrix;
4038         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4039         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4040 
4041         (*C)->form_explicit_transpose = PETSC_TRUE;
4042         (*C)->transupdated = PETSC_TRUE;
4043         Ccusp->rowoffsets_gpu = NULL;
4044         CmatT->cprowIndices = NULL;
4045         CmatT->mat = CcsrT;
4046         CcsrT->num_rows = n;
4047         CcsrT->num_cols = m;
4048         CcsrT->num_entries = c->nz;
4049 
4050         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4051         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4052         CcsrT->values = new THRUSTARRAY(c->nz);
4053 
4054         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4055         auto rT = CcsrT->row_offsets->begin();
4056         if (AT) {
4057           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4058           thrust::advance(rT,-1);
4059         }
4060         if (BT) {
4061           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4062           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4063           thrust::copy(titb,tite,rT);
4064         }
4065         auto cT = CcsrT->column_indices->begin();
4066         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4067         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4068         auto vT = CcsrT->values->begin();
4069         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4070         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4071         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4072 
4073         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4074         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4075         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4076         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4077         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4078         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4079         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4080         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4081         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4082 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4083         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4084                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4085                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4086                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4087 #endif
4088         Ccusp->matTranspose = CmatT;
4089       }
4090     }
4091 
4092     c->singlemalloc = PETSC_FALSE;
4093     c->free_a       = PETSC_TRUE;
4094     c->free_ij      = PETSC_TRUE;
4095     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4096     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4097     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4098       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4099       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4100       ii   = *Ccsr->row_offsets;
4101       jj   = *Ccsr->column_indices;
4102       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4103       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4104     } else {
4105       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4106       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4107     }
4108     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4109     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4110     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4111     c->maxnz = c->nz;
4112     c->nonzerorowcnt = 0;
4113     c->rmax = 0;
4114     for (i = 0; i < m; i++) {
4115       const PetscInt nn = c->i[i+1] - c->i[i];
4116       c->ilen[i] = c->imax[i] = nn;
4117       c->nonzerorowcnt += (PetscInt)!!nn;
4118       c->rmax = PetscMax(c->rmax,nn);
4119     }
4120     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4121     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4122     (*C)->nonzerostate++;
4123     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4124     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4125     Ccusp->nonzerostate = (*C)->nonzerostate;
4126     (*C)->preallocated  = PETSC_TRUE;
4127   } else {
4128     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4129     c = (Mat_SeqAIJ*)(*C)->data;
4130     if (c->nz) {
4131       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4132       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4133       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4134       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4135       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4136       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4137       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4138       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4139       Acsr = (CsrMatrix*)Acusp->mat->mat;
4140       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4141       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4142       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4143       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4144       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4145       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4146       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4147       auto pmid = Ccusp->cooPerm->begin();
4148       thrust::advance(pmid,Acsr->num_entries);
4149       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4150       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4151                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4152       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4153                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4154       thrust::for_each(zibait,zieait,VecCUDAEquals());
4155       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4156                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4157       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4158                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4159       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4160       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4161       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4162         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4163         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4164         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4165         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4166         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4167         auto vT = CcsrT->values->begin();
4168         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4169         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4170         (*C)->transupdated = PETSC_TRUE;
4171       }
4172       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4173     }
4174   }
4175   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4176   (*C)->assembled     = PETSC_TRUE;
4177   (*C)->was_assembled = PETSC_FALSE;
4178   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4179   PetscFunctionReturn(0);
4180 }
4181 
/*
  Copies a sub-array of the matrix values of A into v.

  Input parameters:
+   A   - the SEQAIJCUSPARSE matrix (values are read on the device)
+   n   - number of entries to copy
+   idx - indices (into the aij value array) of the entries to copy, or NULL
+         to copy the first n entries contiguously

  Output parameter:
+   v   - destination array; may be either host or device memory (detected
+         with isCudaMem), the copy direction is chosen accordingly
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index set, then gather the selected entries on the device
       with a thrust permutation iterator */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v lives on the host: stage the gather in a temporary device array */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the copy above is device-to-host, so log it as
     GPU->CPU traffic (was incorrectly logged as CpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4221