xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision d49cd2b75844d238472ed9de8ddd2df14c289b9f)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
7 
8 #include <petscconf.h>
9 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10 #include <../src/mat/impls/sbaij/seq/sbaij.h>
11 #include <../src/vec/vec/impls/dvecimpl.h>
12 #include <petsc/private/vecimpl.h>
13 #undef VecType
14 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15 #include <thrust/async/for_each.h>
16 
/* Option tables for PetscOptionsEnum(); the last three entries of each array are the
   enum type name, the option prefix, and the NULL terminator required by PetscOptionsEnum(). */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
86 
87 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
88 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
89 
90 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's GPU data
   and attaches it to the matrix's cuSPARSE handle so subsequent cuSPARSE calls
   run on that stream. Errors with PETSC_ERR_COR if the GPU data (spptr) is missing.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
103 
/*
   MatCUSPARSESetHandle - installs an externally created cuSPARSE handle on the
   matrix. Any different handle currently stored is destroyed first, and the
   (possibly new) handle is switched to device pointer mode, which the rest of
   this file relies on for scalar arguments.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* drop the old handle before adopting the caller's */
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
120 
/*
   MatCUSPARSEClearHandle - forgets the matrix's cuSPARSE handle without calling
   cusparseDestroy() on it (presumably the handle is owned elsewhere, e.g. one
   installed via MatCUSPARSESetHandle() -- verify with callers). Silently does
   nothing for matrices that are not SEQAIJCUSPARSE or have no GPU data.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusp;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (!iscusp || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE ("cusparse")
   as the solver package that produced this factor. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates an (empty) SEQAIJCUSPARSE matrix B
   suitable to hold a factorization of A for the MATSOLVERCUSPARSE package, and wires
   up the symbolic-factorization entry points and preferred orderings for the
   requested factor type. Unsupported factor types raise PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt       n = A->rmap->n;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* nested dissection for full LU; natural ordering for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
184 
/* Type-specific implementation behind MatCUSPARSESetFormat(): for SEQAIJCUSPARSE,
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single stored format. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:   /* fall through: both ops map onto the same field */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
202 
203 /*@
204    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205    operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective
208 
209    Input Parameters:
210 +  A - Matrix of type SEQAIJCUSPARSE
211 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
212 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
216    Level: intermediate
217 
218 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* PetscTryMethod: silently a no-op for matrix types that do not compose MatCUSPARSESetFormat_C */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229 
/*
   MatSetOption_SeqAIJCUSPARSE - handles MAT_FORM_EXPLICIT_TRANSPOSE locally
   (the GPU keeps an explicit transpose that must be invalidated when the
   option is switched off); all other options fall through to the SeqAIJ handler.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* discard a previously built transpose so a later re-enable cannot see stale data */
    if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
247 
248 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
249 
/*
   MatLUFactorNumeric_SeqAIJCUSPARSE - performs the numeric LU factorization on
   the CPU (via MatLUFactorNumeric_SeqAIJ), marks the result CPU-resident, picks
   the solve kernels (natural-ordering variants when both permutations are the
   identity), and pushes the triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  IS             isrow = b->row,iscol = b->col;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* matsolve is not provided by this type in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* runtime options.
   Storage-format and algorithm choices apply only to non-factored matrices, hence
   the MAT_FACTOR_NONE guard; the algorithm options exist only for CUDA >= 11.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* storage format for all operations (SpMV and triangular solve) */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    /* entry 0 of MatCUSPARSECsr2CscAlgorithms[] is a placeholder: cuSPARSE's csr2csc enum starts at 1 */
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
316 
/* Symbolic ILU: drop any GPU triangular-factor data left from a previous
   factorization, run the CPU SeqAIJ symbolic phase, and route the numeric
   phase to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
328 
/* Symbolic LU: reset stale GPU factor data, delegate the symbolic phase to the
   CPU SeqAIJ implementation, and install the CUSPARSE numeric-phase callback. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
340 
/* Symbolic ICC: reset stale GPU factor data, delegate the symbolic phase to the
   CPU SeqAIJ implementation, and install the CUSPARSE Cholesky numeric callback. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
352 
/* Symbolic Cholesky: reset stale GPU factor data, delegate the symbolic phase to
   the CPU SeqAIJ implementation, and install the CUSPARSE numeric callback. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
364 
/*
   Builds or refreshes the cuSPARSE copy of the unit lower triangular factor L
   from the host-side SeqAIJ factored matrix A, and runs the cuSPARSE
   triangular-solve analysis on it.

   The strictly-lower entries of row i live in a->a/a->j between ai[i] and
   ai[i+1]; a unit diagonal entry is appended to each row, so L has
   n + ai[n] - ai[1] nonzeros (row 0 consists of the 1 on the diagonal alone).

   First call (loTriFactorPtr not yet built): assembles the CSR arrays in
   pinned host memory, uploads them into thrust device arrays, and performs
   the solve analysis. Later calls (same sparsity pattern) refresh only the
   numerical values and re-upload them.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the CPU copy is the freshest (or only) copy of the data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the later device uploads are fast */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix; row 0 holds just the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        /* CUDA >= 9: csrsv2 requires the GENERAL matrix type; fill mode/diag type below still describe L */
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix; assign() copies the pinned host arrays to the device */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
        /* CUDA >= 9: the csrsv2 API needs an explicitly sized work buffer for the analysis/solve */
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                               #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                               #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer; AALo is kept alive in AA_h for later value-only updates,
           while the structural arrays are no longer needed on the host */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix (same traversal as the build path, values only;
           NOTE(review): vi is assigned but unused in this branch) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
511 
512 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
513 {
514   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
515   PetscInt                          n = A->rmap->n;
516   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
517   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
518   cusparseStatus_t                  stat;
519   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
520   const MatScalar                   *aa = a->a,*v;
521   PetscInt                          *AiUp, *AjUp;
522   PetscInt                          i,nz, nzUpper, offset;
523   PetscErrorCode                    ierr;
524   cudaError_t                       cerr;
525 
526   PetscFunctionBegin;
527   if (!n) PetscFunctionReturn(0);
528   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
529     try {
530       /* next, figure out the number of nonzeros in the upper triangular matrix. */
531       nzUpper = adiag[0]-adiag[n];
532       if (!upTriFactor) {
533         PetscScalar *AAUp;
534 
535         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
536 
537         /* Allocate Space for the upper triangular matrix */
538         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
539         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
540 
541         /* Fill the upper triangular matrix */
542         AiUp[0]=(PetscInt) 0;
543         AiUp[n]=nzUpper;
544         offset = nzUpper;
545         for (i=n-1; i>=0; i--) {
546           v  = aa + adiag[i+1] + 1;
547           vi = aj + adiag[i+1] + 1;
548 
549           /* number of elements NOT on the diagonal */
550           nz = adiag[i] - adiag[i+1]-1;
551 
552           /* decrement the offset */
553           offset -= (nz+1);
554 
555           /* first, set the diagonal elements */
556           AjUp[offset] = (PetscInt) i;
557           AAUp[offset] = (MatScalar)1./v[nz];
558           AiUp[i]      = AiUp[i+1] - (nz+1);
559 
560           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
561           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
562         }
563 
564         /* allocate space for the triangular factor information */
565         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
566         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
567 
568         /* Create the matrix description */
569         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
570         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
571        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
572         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
573        #else
574         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
575        #endif
576         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
577         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
578 
579         /* set the operation */
580         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
581 
582         /* set the matrix */
583         upTriFactor->csrMat = new CsrMatrix;
584         upTriFactor->csrMat->num_rows = n;
585         upTriFactor->csrMat->num_cols = n;
586         upTriFactor->csrMat->num_entries = nzUpper;
587 
588         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
589         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
590 
591         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
592         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
593 
594         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
595         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
596 
597         /* Create the solve analysis information */
598         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
599         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
600       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
601         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
602                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
603                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
604                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
605                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
606         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
607       #endif
608 
609         /* perform the solve analysis */
610         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
611                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
612                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
613                                  upTriFactor->csrMat->column_indices->data().get(),
614                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
615                                  upTriFactor->solveInfo,
616                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
617                                #else
618                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
619                                #endif
620         cerr = WaitForCUDA();CHKERRCUDA(cerr);
621         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
622 
623         /* assign the pointer */
624         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
625         upTriFactor->AA_h = AAUp;
626         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
627         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
628         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
629       } else {
630         if (!upTriFactor->AA_h) {
631           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
632         }
633         /* Fill the upper triangular matrix */
634         offset = nzUpper;
635         for (i=n-1; i>=0; i--) {
636           v  = aa + adiag[i+1] + 1;
637 
638           /* number of elements NOT on the diagonal */
639           nz = adiag[i] - adiag[i+1]-1;
640 
641           /* decrement the offset */
642           offset -= (nz+1);
643 
644           /* first, set the diagonal elements */
645           upTriFactor->AA_h[offset] = 1./v[nz];
646           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
647         }
648         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
649         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
650       }
651     } catch(char *ex) {
652       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
653     }
654   }
655   PetscFunctionReturn(0);
656 }
657 
/*
   Mirrors a host-side ILU factorization onto the GPU: builds the lower and upper
   triangular factor structures, allocates the scratch vector shared by the
   triangular solves, and caches the row/column permutation indices (only when the
   orderings are not the identity) for use by MatSolve_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* build/update the L and U triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* device scratch vector of length n; allocated once and reused across solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices */
  /* only upload the row permutation when it is non-trivial and not already cached */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);  /* host-to-device copy */
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* upper triangular indices */
  /* likewise for the column permutation */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);  /* host-to-device copy */
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
701 
/*
   Builds (first call) or refreshes (subsequent calls) the device-side triangular
   factors for an ICC/Cholesky factorization. The factor is stored on the host in
   upper-triangular CSR-like form with the diagonal entry last in each row; this
   routine packs two value arrays on the host — AAUp for the U solve and AALo for
   the (transposed) L solve — uploads them, and runs the cuSPARSE solve analysis.
   On the first call the structure (AiUp/AjUp) is also built and both factor
   structs are created; later calls reuse the structure and only reupload values.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is read through both Mat_SeqAIJ and Mat_SeqSBAIJ here;
     presumably the ICC factor's i/j/a layout is sbaij-compatible — confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];  /* diagonal entry stored last in each row */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* negate the off-diagonals for U; scale by the diagonal for L */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the L factor reuses the upper-triangular structure and is applied via a
           transposed solve, hence UPPER fill mode with a TRANSPOSE operation below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                                #else
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                                #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure already on the GPU: repack only the values and reupload */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
921 
/*
   Mirrors a host-side ICC factorization onto the GPU: builds both triangular
   factor structures, allocates the scratch vector used by the solves, and — when
   the row permutation is not the identity — caches the permutation and its
   inverse on the device for MatSolve_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* device scratch vector of length n; allocated once and reused across solves */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both factors share the structure: n diagonal entries plus the off-diagonals twice */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    /* rpermIndices gets the permutation, cpermIndices its inverse */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);    /* host-to-device copy */
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);  /* host-to-device copy */
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
959 
/*
   Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE: syncs A to the host,
   factors there with the SeqAIJ routine, installs the appropriate GPU solve
   callbacks (NaturalOrdering variants when the row permutation is the identity),
   and uploads the triangular factors to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq    = (Mat_SeqAIJ*)B->data;
  IS             rowperm  = bseq->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization runs on the host, so make sure A is current there */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(rowperm,&natural);CHKERRQ(ierr);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                      : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                      : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
989 
/*
   Builds the transposes (CSC forms) of the lower and upper triangular factors on
   the GPU, used by the transpose solves. For each factor: a new descriptor is
   created with the fill mode flipped, the CSR data is converted to CSC with
   cusparse csr2csc, and the cuSPARSE solve analysis is run on the result.

   Fix: the MAT_CUSPARSEGenerateTranspose event was begun twice (the second call
   after each csr2csc was PetscLogEventBegin instead of PetscLogEventEnd), which
   left the event permanently "open" and corrupted PETSc's event-log nesting.
   Both occurrences now call PetscLogEventEnd, matching the Begin/End pairing
   used for MAT_CUSPARSESolveAnalysis in this same routine.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor:
     same type/base/diag as the source, but with the fill mode flipped
     since the transpose of a lower factor is upper (and vice versa) */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2csc needs an explicitly sized scratch buffer */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           loTriFactorT->solveInfo,
                           loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           upTriFactorT->solveInfo,
                           upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1195 
/* Unary functor that maps a PetscScalar to a PetscInt by truncating its real part.
   Used below with thrust::transform to convert the csr2csc permutation, which is
   produced in a scalar-valued array, into an integer index array. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    const PetscReal r = PetscRealPart(s);
    return static_cast<PetscInt>(r);
  }
};
1204 
/* MatSeqAIJCUSPARSEFormExplicitTransposeForMult - builds (or refreshes the values of) an
   explicit GPU copy of A^T in cusparsestruct->matTranspose so transpose multiplies can use
   a regular SpMV on the transposed storage.

   For CSR storage, the column permutation produced by csr2csc is computed once and cached in
   cusparsestruct->csr2csc_i; subsequent calls only permute the (possibly updated) values of A
   into A^T with a thrust gather, avoiding another csr2csc. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* Nothing to do when the user did not request an explicit transpose or the matrix is empty */
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  /* Transpose already up to date with the current values: early out */
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* The cached csr2csc permutation is only valid for CSR storage; drop any stale transpose otherwise */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    /* Transpose inherits the index base (0- or 1-based) of the original matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalars (1, 0, 1) used by SpMV calls with this struct */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Allocate A^T as CSR with swapped dimensions; values are filled later by the csr2csc path below */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* Upload the host row offsets a->i (needed as csr2csc input below).
         NOTE(review): unlike the identical upload further down, this one is not logged with
         PetscLogCpuToGpu - presumably an oversight; confirm before relying on transfer logs. */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* Legacy (pre CUDA-11) path: HYB -> CSR -> CSC (= transpose) -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         NOTE(review): tempT keeps num_rows = A->rmap->n / num_cols = A->cmap->n and a
         row_offsets array of length A->rmap->n+1, i.e. the dimensions are NOT swapped for the
         transpose - this is only consistent for square matrices; verify for rectangular A. */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the value permutation once: run csr2csc on the sequence 0,1,...,nnz-1 (stored as
         scalars) so matrixT->values receives each entry's source position, then convert that
         to the integer gather map csr2csc_i */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* Gather the current values of A into A^T using the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1413 
1414 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* MatSolveTranspose_SeqAIJCUSPARSE - solve A^T x = b on the GPU using the transposed
   triangular factors, with the row/column permutations of the factorization ordering.
   Since A = LU, A^T = U^T L^T: the (transposed) upper factor is applied first, then the
   (transposed) lower factor. The transposed factors are generated lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;  /* scratch between the two solves */
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, placing the permuted right-hand side in x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor (input xarray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz flops minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1502 
/* MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solve A^T x = b on the GPU when the
   factorization used natural ordering, so no row/column permutations are needed. As above,
   A^T = U^T L^T: solve with the transposed upper factor first, then the transposed lower
   factor. The transposed factors are generated lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;  /* scratch between the two solves */
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed upper factor (input barray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed lower factor (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz flops minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1572 
/* MatSolve_SeqAIJCUSPARSE - solve A x = b on the GPU with the triangular factors (A = LU)
   and the row/column permutations of the factorization ordering: permute b, forward-solve
   with L, back-solve with U, then permute the result into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;  /* scratch between the two solves */
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, placing the permuted right-hand side in tempGPU */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         xarray,
                         loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve U (input xarray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif


  /* Last, reorder with the column permutation, writing the final solution into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz flops minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1649 
/* MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solve A x = b on the GPU when the factorization
   used natural ordering, so no permutations are needed: forward-solve with L, then
   back-solve with U directly between the vector arrays. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;  /* scratch between the two solves */
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L (input barray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif


  /* Next, solve U (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: roughly 2*nnz flops minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1712 
/* Download the numerical values of A from the GPU when the device copy is the
   only up-to-date one; afterwards both host and device hold current values. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* only the values array can differ between host and device; the host
       sparsity pattern (a->i, a->j) is authoritative and never re-downloaded */
    cerr = cudaMemcpy(aij->a,csr->values->data().get(),aij->nz*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1733 
/* MatSeqAIJGetArray() implementation for SEQAIJCUSPARSE: make the host values
   current, hand the host array out for (potential) writing, and mark the
   device copy stale since the caller may modify the returned values. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  /* the caller gets write access, so the GPU copy can no longer be trusted */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = aij->a;
  PetscFunctionReturn(0);
}
1745 
/*
   Upload the matrix A to the GPU in the storage format recorded in
   cusparsestruct->format (CSR, ELL, or HYB).

   Fast path: if the nonzero pattern is unchanged (same nonzerostate) and the
   format is CSR, only the values array is refreshed in place.  Otherwise the
   entire GPU representation (descriptors, index arrays, values, scalar
   constants) is destroyed and rebuilt from the host CSR or compressed-row data.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to FALSE when only the structure (no values) is uploaded */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* same sparsity pattern: refresh the device values array in place */
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed, so a previously-built explicit transpose is stale (destroy flag FALSE: keep its structure) */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* nonzero pattern (or format) changed: throw away the whole device representation */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* compressed-row storage skips empty rows: use its row count/offsets/row index */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* no host values yet (e.g. structure-only assembly): upload structure only and
           do not mark the host/device values as synchronized */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants (handle uses CUSPARSE_POINTER_MODE_DEVICE below) */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary CSR matrix on the device, convert it to HYB/ELL, then discard it */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          /* ELL uses the fixed-width (max) partition; HYB lets cusparse choose */
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging arrays are no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* accounts for row offsets + column indices (int), compressed row indices (PetscInt),
           and the 3 scalar constants + values (PetscScalar) transferred above */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1902 
/* thrust functor for zipped (src,dst) tuples: accumulate, i.e. dst += src */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1912 
/* thrust functor for zipped (src,dst) tuples: copy forward, i.e. dst = src */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__ void operator()(T t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
1922 
/* thrust functor for zipped tuples: copy in the reverse direction, i.e. first = second */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__ void operator()(T t) { thrust::get<0>(t) = thrust::get<1>(t); }
};
1932 
/* Per-product work data attached to C->product->data for the
   sparse*dense and sparse*sparse products implemented below */
struct MatMatCusparse {
  PetscBool             cisdense;      /* caller's C was MATSEQDENSE (CPU): convert back after the numeric phase */
  PetscScalar           *Bt;           /* device buffer holding B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;             /* intermediate dense product A*B (resp. A*B^T) for PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;         /* flop count logged by the numeric phase */
  CsrMatrix             *Bcsr;         /* uncompressed copy of B when B uses compressed-row storage */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;   /* descriptor for Bcsr (used instead of Bmat->matDescr when Bcsr is set) */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor for B (SpMM) */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor for C or X (SpMM) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize;  /* size of mmBuffer in bytes */
  void                  *mmBuffer;     /* SpMM / SpGEMM_compute workspace */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;    /* reusable SpGEMM descriptor (symbolic/numeric reuse) */
#endif
};
1952 
/* Destructor for the MatMatCusparse work data hung off C->product->data */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op */
  delete mm->Bcsr;                          /* delete of NULL is a no-op  */
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors/buffers are created lazily, so each may legitimately be NULL */
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1977 
1978 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1979 
/*
   Numeric phase of C = op(A)*op(B) with A sparse (SEQAIJCUSPARSE) and B dense.
   For PtAP/RARt the sparse-dense product is computed into mmdata->X first and
   then combined with B via a dense-dense product.

   CUDA >= 11 uses cusparseSpMM() (supports op(B) directly, descriptors cached
   in mmdata); older CUDA uses cusparseXcsrmm() and forms B^T explicitly with
   cublasXgeam() into the mmdata->Bt buffer allocated in the symbolic phase.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the m x n size of the sparse-dense product computed here
     (for PtAP/RARt this is the intermediate X = A*B resp. A*B^T) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either transpose on the fly or use the explicitly formed transpose of A */
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  /* NOTE(review): when only one of the two LDAs changed, the descriptor that is
     kept is not refreshed with the current device pointer in this branch —
     confirm the corresponding array base cannot move while its LDA is unchanged */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    /* the sparse descriptor of A may not exist yet (created lazily) */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the SpMM workspace */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly into the pre-allocated mmdata->Bt buffer */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense product against B (transposed for PtAP) */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert C (and B, if we converted it above) back to the caller's CPU type */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2152 
/*
   Symbolic phase of the sparse(A)*dense(B) products: sets the sizes and type
   of C and allocates the MatMatCusparse work data reused by the numeric phase.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* the result size depends on which operands are (implicitly) transposed */
  if (product->type == MATPRODUCT_AB) {
    m = A->rmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    m = A->cmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    m = A->rmap->n; n = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    m = B->cmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    m = B->rmap->n; n = B->rmap->n;
  } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscInt Xn = (product->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    ierr = MatSetSizes(mmdata->X,A->rmap->n,Xn,A->rmap->n,Xn);CHKERRQ(ierr);
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2226 
/*
   Numeric phase of the sparse*sparse product C = op(A)*op(B), both operands
   SEQAIJCUSPARSE.  The sparsity pattern and work buffers were produced by the
   symbolic phase; here only the values of C are (re)computed, with
   cusparseSpGEMM_compute/copy on CUDA >= 11 or cusparse_csr_spgemm otherwise.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just finish assembly */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  ptype = product->type;
  /* for symmetric operands the transpose is the matrix itself: avoid using/forming a transpose */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute values into the pattern established by the symbolic phase, then
     copy them into C's descriptor */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2341 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of C = A*B,
   A^t*B or A*B^t where both operands are MATSEQAIJCUSPARSE in CSR format.

   Builds the cuSPARSE structures for C, computes the sparsity pattern of C on
   the device (cusparseSpGEMM_* for CUDA >= 11.0; the legacy cusparseXcsrgemm
   path below performs a full numeric multiply because the old API has no
   symbolic-only mode), copies the pattern back to the host Mat_SeqAIJ arrays,
   and stores device buffers/descriptors in the MatMatCusparse product data so
   the numeric phase can reuse them.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* when an operand is symmetric, the transpose products reduce to a plain A*B */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick the result sizes and the operand mult structs; explicit transposes are formed on demand */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device copies of the scalar constants used as alpha/beta by the cuSPARSE calls */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow CSR view of B (shared values/columns, full row offsets) for the SpGEMM call */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C starts with an empty descriptor; cusparseCsrSetPointers() attaches the arrays after nnz is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the device sparsity pattern into the host (Mat_SeqAIJ) arrays of C */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths and row statistics from the (expanded) row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2665 
2666 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2667 
/* Select the symbolic product implementation for mat = op(A,B[,C]) where A is
   MATSEQAIJCUSPARSE. Dispatches to the dense-CUDA kernel when B is SeqDense,
   to the CUSPARSE sparse-sparse kernel when B (and C for ABC) are CUSPARSE and
   not bound to the CPU, and otherwise falls back to the plain SeqAIJ path. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* B only counts as CUSPARSE when neither operand has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  /* only the triple product inspects C; for the other products C plays no role */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    const MatProductType t = product->type;

    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt || t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) { /* A lives on the CPU: use the host AIJ x Dense path */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave the operation unset */
  } else if (Biscusp && Ciscusp) {
    const MatProductType t = product->type;

    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt || t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave the operation unset */
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2726 
/* yy = A*xx: thin wrapper over the shared kernel (no add vector, no transpose, no Hermitian) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2735 
/* zz = A*xx + yy: thin wrapper over the shared kernel (no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2744 
/* yy = A^H*xx: thin wrapper over the shared kernel (trans=TRUE, herm=TRUE selects op = ^H) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2753 
/* zz = A^H*xx + yy: thin wrapper over the shared kernel (trans=TRUE, herm=TRUE selects op = ^H) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2762 
/* yy = A^T*xx: thin wrapper over the shared kernel (trans=TRUE, herm=FALSE selects op = ^T) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2771 
/* ScatterAdd: y[idx[i]] += x[i] for i in [0,n).
   One thread per entry; callers must launch at least n threads in total.
   NOTE(review): the unguarded += races if idx contains duplicates within a
   single launch -- presumably the compressed-row index sets used here are
   duplicate-free; confirm against call sites. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* compute the flat index in PetscInt: the old `int i = blockIdx.x*blockDim.x + threadIdx.x`
     overflows/truncates for launches covering more than INT_MAX entries when
     PetscInt is 64-bit; cast before multiplying so the product is done in PetscInt */
  PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i]; /* tail guard: grid rarely divides n evenly */
}
2777 
2778 /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2779 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
2780 {
2781   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2782   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2783   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2784   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2785   PetscErrorCode               ierr;
2786   cusparseStatus_t             stat;
2787   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2788   PetscBool                    compressed;
2789 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2790   PetscInt                     nx,ny;
2791 #endif
2792 
2793   PetscFunctionBegin;
2794   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
2795   if (!a->nonzerorowcnt) {
2796     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2797     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2798     PetscFunctionReturn(0);
2799   }
2800   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
2801   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2802   if (!trans) {
2803     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2804     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2805   } else {
2806     if (herm || !A->form_explicit_transpose) {
2807       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2808       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2809     } else {
2810       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2811       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2812     }
2813   }
2814   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2815   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2816 
2817   try {
2818     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2819     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2820     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2821 
2822     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2823     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2824       /* z = A x + beta y.
2825          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2826          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2827       */
2828       xptr = xarray;
2829       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2830       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2831      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2832       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2833           allocated to accommodate different uses. So we get the length info directly from mat.
2834        */
2835       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2836         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2837         nx = mat->num_cols;
2838         ny = mat->num_rows;
2839       }
2840      #endif
2841     } else {
2842       /* z = A^T x + beta y
2843          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2844          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2845        */
2846       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2847       dptr = zarray;
2848       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2849       if (compressed) { /* Scatter x to work vector */
2850         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2851         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2852                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2853                          VecCUDAEqualsReverse());
2854       }
2855      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2856       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2857         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2858         nx = mat->num_rows;
2859         ny = mat->num_cols;
2860       }
2861      #endif
2862     }
2863 
2864     /* csr_spmv does y = alpha op(A) x + beta y */
2865     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2866      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2867       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2868       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2869         cudaError_t cerr;
2870         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2871         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2872         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2873                                 matstruct->matDescr,
2874                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2875                                 matstruct->cuSpMV[opA].vecYDescr,
2876                                 cusparse_scalartype,
2877                                 cusparsestruct->spmvAlg,
2878                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2879         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2880 
2881         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2882       } else {
2883         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2884         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2885         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2886       }
2887 
2888       stat = cusparseSpMV(cusparsestruct->handle, opA,
2889                                matstruct->alpha_one,
2890                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2891                                matstruct->cuSpMV[opA].vecXDescr,
2892                                beta,
2893                                matstruct->cuSpMV[opA].vecYDescr,
2894                                cusparse_scalartype,
2895                                cusparsestruct->spmvAlg,
2896                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2897      #else
2898       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2899       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2900                                mat->num_rows, mat->num_cols,
2901                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2902                                mat->values->data().get(), mat->row_offsets->data().get(),
2903                                mat->column_indices->data().get(), xptr, beta,
2904                                dptr);CHKERRCUSPARSE(stat);
2905      #endif
2906     } else {
2907       if (cusparsestruct->nrows) {
2908        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2909         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2910        #else
2911         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2912         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2913                                  matstruct->alpha_one, matstruct->descr, hybMat,
2914                                  xptr, beta,
2915                                  dptr);CHKERRCUSPARSE(stat);
2916        #endif
2917       }
2918     }
2919     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2920 
2921     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2922       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2923         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2924           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2925         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2926           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2927         }
2928       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2929         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
2930       }
2931 
2932       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2933       if (compressed) {
2934         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2935         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2936            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2937            prevent that. So I just add a ScatterAdd kernel.
2938          */
2939        #if 0
2940         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2941         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2942                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2943                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2944                          VecCUDAPlusEquals());
2945        #else
2946         PetscInt n = matstruct->cprowIndices->size();
2947         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2948        #endif
2949         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2950       }
2951     } else {
2952       if (yy && yy != zz) {
2953         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2954       }
2955     }
2956     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2957     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2958     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
2959   } catch(char *ex) {
2960     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
2961   }
2962   if (yy) {
2963     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2964   } else {
2965     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2966   }
2967   PetscFunctionReturn(0);
2968 }
2969 
/* zz = A^T*xx + yy, dispatched to the shared mult-add kernel
   (first flag presumably selects transpose, second hermitian — mirrors the sibling Hermitian variant) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2978 
/* Finish assembly on the host, then mark the matrix as GPU-resident when a
   device-side split-CSR structure already exists (i.e. it was assembled on the GPU). */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *dmat;

  PetscFunctionBegin;
  /* capture the device matrix pointer (only unfactored matrices carry a Mat_SeqAIJCUSPARSE spptr) */
  dmat = (A->factortype == MAT_FACTOR_NONE) ? ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat : NULL;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* does very little if assembled on GPU - call it? */
  /* only final assembly of a GPU-bound matrix with a device structure flips the offload mask */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && dmat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2995 
2996 /* --------------------------------------------------------------------------------*/
2997 /*@
2998    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3000    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3001    assembly performance the user should preallocate the matrix storage by setting
3002    the parameter nz (or the array nnz).  By setting these parameters accurately,
3003    performance during matrix assembly can be increased by more than a factor of 50.
3004 
3005    Collective
3006 
3007    Input Parameters:
3008 +  comm - MPI communicator, set to PETSC_COMM_SELF
3009 .  m - number of rows
3010 .  n - number of columns
3011 .  nz - number of nonzeros per row (same for all rows)
3012 -  nnz - array containing the number of nonzeros in the various rows
3013          (possibly different for each row) or NULL
3014 
3015    Output Parameter:
3016 .  A - the matrix
3017 
3018    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
3020    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
3021 
3022    Notes:
3023    If nnz is given then nz is ignored
3024 
3025    The AIJ format (also called the Yale sparse matrix format or
3026    compressed row storage), is fully compatible with standard Fortran 77
3027    storage.  That is, the stored row and column indices can begin at
3028    either one (as in Fortran) or zero.  See the users' manual for details.
3029 
3030    Specify the preallocated storage with either nz or nnz (not both).
3031    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3032    allocation.  For large problems you MUST preallocate memory or you
3033    will get TERRIBLE performance, see the users' manual chapter on matrices.
3034 
3035    By default, this format uses inodes (identical nodes) when possible, to
3036    improve numerical efficiency of matrix-vector products and solves. We
3037    search for consecutive rows with the same nonzero structure, thereby
3038    reusing matrix information to achieve increased efficiency.
3039 
3040    Level: intermediate
3041 
3042 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3043 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* create an empty matrix, set its type to the CUSPARSE AIJ type, then preallocate */
  err = MatCreate(comm,A);CHKERRQ(err);
  err = MatSetSizes(*A,m,n,m,n);CHKERRQ(err); /* sequential matrix: local sizes equal global sizes */
  err = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(err);
  err = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3055 
/* Destroy a SeqAIJCUSPARSE matrix: free the GPU-side structures (mult structs or
   triangular factors), free the device split-CSR struct if one was created, remove
   composed methods, then run the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device matrix pointer and detach it before spptr is destroyed,
       so it can still be freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to host so its device pointers can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* compressed-row case allocated a separate device row-offset array */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* clear all type-specific composed methods before falling back to the base destructor */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3091 
3092 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3093 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a host SeqAIJ copy first, then converting it in place
   to the CUSPARSE type. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(err);
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3103 
/* Y = a*X + Y entirely on the GPU. Three paths:
   - if the two matrices do not share the GPU axpy op, fall back to the host MatAXPY_SeqAIJ;
   - SAME_NONZERO_PATTERN (given, or detected by comparing the CSR index arrays): one cublas axpy
     over the value arrays;
   - SUBSET_NONZERO_PATTERN: cusparse csrgeam into Y's existing pattern;
   - anything else: host fallback. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* one of the matrices is not using the GPU path; invalidate Y's cached transpose and do it on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: same nnz count and identical
     row-offset and column-index arrays (checked on device) imply the same pattern */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* X's pattern is a subset of Y's: csrgeam computes b*Y + a*X back into Y's arrays */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* the a and b scalars live on the host here, so switch the pointer mode temporarily */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace query + allocation for csrgeam */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode expected elsewhere in this file */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry-for-entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different pattern: Y's structure will change, so do it on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3201 
/* Y <- a*Y: scale the device-resident CSR value array with a single cuBLAS scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr); /* number of stored nonzeros */
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,n,&a,vals,inc);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); /* cached diagonal no longer matches the scaled values */
  PetscFunctionReturn(0);
}
3223 
/* Zero all stored values, on both the device (CSR and cached transpose, if present)
   and the host, keeping the nonzero structure intact. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuzeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix*)spptr->mat->mat;
      if (csr->values) {
        gpuzeroed = PETSC_TRUE; /* device copy is now valid (all zeros) too */
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* zero the host values directly rather than calling MatZeroEntries_SeqAIJ */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = gpuzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3255 
/* Switch the matrix's operation table between the host (SeqAIJ) and the GPU
   (SeqAIJCUSPARSE) implementations, and compose/clear the matching type-specific
   methods. Factored matrices are left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: bring the values back to the host first */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the GPU-specific composed methods */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* binding to GPU: install the CUSPARSE implementations (mirror of the table above) */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inode optimization only helps the CPU kernels */
  PetscFunctionReturn(0);
}
3306 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: (optionally) duplicate or copy per
   the reuse mode, allocate the CUSPARSE side structure (plain or tri-factor,
   depending on factortype), install the GPU operation table, and retype. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* MAT_INPLACE_MATRIX falls through: B is A itself */
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU too */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      spptr->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry the triangular-factor variant of the struct */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the full GPU operation table and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3360 
/* Type constructor: build a plain SeqAIJ matrix, then convert it in place
   to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatCreate_SeqAIJ(B);CHKERRQ(err);
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3370 
3371 /*MC
3372    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3373 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3375    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3376    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3377 
3378    Options Database Keys:
3379 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3380 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3381 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3382 
3383   Level: beginner
3384 
3385 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3386 M*/
3387 
3388 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
3389 
/* Register the cusparse solver packages: the banded LU solver for plain SeqAIJ,
   and the general cusparse factorizations (LU, Cholesky, ILU, ICC) for SeqAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3403 
/* Free a Mat_SeqAIJCUSPARSE struct: both mult structs, the thrust-managed work
   arrays, the cusparse handle, and finally the struct itself. Safe on NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* delete on NULL is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); /* also NULLs the caller's pointer */
  }
  PetscFunctionReturn(0);
}
3423 
/* Free a CsrMatrix (its three device arrays, then the struct) and NULL the caller's pointer.
   Safe on NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3436 
/* Overload for triangular-factor structs: free the matrix descriptor, solve-analysis
   info, CSR data, and the device/host scratch buffers, then the struct. Safe on NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host buffer */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr); /* also NULLs the caller's pointer */
  }
  PetscFunctionReturn(0);
}
3456 
/* Free a Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR or, pre-CUDA-11,
   HYB), the legacy descriptor, the device-resident scalar constants, and (CUDA >= 11)
   the generic-SpMV descriptors/buffers. Safe on NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* check the error code instead of ignoring it */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident 1/0 scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation (non-transpose/transpose/hermitian) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3500 
/* Reset (but do not free) a tri-factors container: destroy all four factor structs,
   the permutation index vectors, the work vector, and the banded-solver device arrays.
   All freed pointers are NULLed so a subsequent Reset is a safe no-op. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* NULL after free (previously omitted): a second Reset would otherwise double-free */
    if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);(*trifactors)->a_band_d = NULL;}
    if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);(*trifactors)->i_band_d = NULL;}
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3523 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - resets the triangular factor data, destroys
   the cusparse handle owned by the structure, and frees the structure itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle; /* explicit assignment avoids the assignment-in-condition idiom */
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3540 
/* Strict-weak ordering for zipped (row,col) index pairs: lexicographic by row, then column */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3551 
/* Equality predicate for zipped (row,col) index pairs: true iff both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3561 
/* Binary functor: 0 when the two indices are equal, 1 otherwise (used with adjacent_difference
   to flag where consecutive sorted indices change) */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3570 
/* Binary functor: logical OR of two flags, returned as 0/1 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3579 
3580 #include <thrust/iterator/discard_iterator.h>
/*
   MatSetValuesCOO_SeqAIJCUSPARSE - sets the matrix values from an array ordered like the
   (coo_i[],coo_j[]) pairs previously given to MatSetPreallocationCOO_SeqAIJCUSPARSE().

   cusp->cooPerm maps the user's COO ordering onto the sorted CSR storage; cusp->cooPerm_a,
   when non-NULL, labels runs of repeated (i,j) entries so they can be reduced (summed).
   v may be a host or a device pointer; v == NULL with INSERT_VALUES zeros the stored values.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;   /* device staging buffer for host-resident v */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host values: stage them in a temporary device array first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* repeated COO entries: reduce them into a work array first, then add to the stored values */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all entries unique: add the permuted values directly in place */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* all entries unique: overwrite the stored values with the permuted input */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* GPU copy is now the authoritative one */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3657 
/*
   MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached explicit transpose as stale;
   when destroy is PETSC_TRUE the transpose structure and the csr2csc permutation are
   freed as well.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3674 
3675 #include <thrust/binary_search.h>
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - preallocates the matrix from n (coo_i[],coo_j[]) pairs.

   Builds the sorted/uniqued CSR pattern on the GPU, together with the permutation
   cusp->cooPerm mapping the user's COO ordering to the stored ordering. cusp->cooPerm_a
   is kept only when duplicate (i,j) entries exist: it labels, for every sorted COO entry,
   the unique entry it reduces into (used later by MatSetValuesCOO_SeqAIJCUSPARSE).
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;   /* nzr counts nonempty rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    /* COO size changed: drop any previously computed permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);            /* device copy of the COO row indices */
    THRUSTINTARRAY d_j(n);            /* device copy of the COO column indices */
    THRUSTINTARRAY ii(A->rmap->n);    /* per-row entry counts (exclusive of a->i[0]) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) pairs lexicographically; cooPerm records where each user entry lands */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;           /* snapshot sorted rows before unique() compacts them */
    THRUSTINTARRAY w = d_j;           /* snapshot sorted columns, same reason */

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag positions where either index changes (first of each run gets 0, duplicates 1),
         then inclusive-scan so each sorted entry maps to the index of its unique entry */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    /* count the entries of each row in the sorted, uniqued index list */
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host-side SeqAIJ CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty matrix: standard zero-size preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3774 
/*
   MatSeqAIJCUSPARSEGetArrayRead - returns a read-only device pointer to the CSR values
   of a MATSEQAIJCUSPARSE matrix, copying the matrix to the GPU first if needed.
   Pair with MatSeqAIJCUSPARSERestoreArrayRead().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
3793 
/*
   MatSeqAIJCUSPARSERestoreArrayRead - releases the pointer obtained with
   MatSeqAIJCUSPARSEGetArrayRead(); the object state is not increased since
   the values were read-only.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the caller's copy of the pointer */
  PetscFunctionReturn(0);
}
3803 
/*
   MatSeqAIJCUSPARSEGetArray - returns a mutable device pointer to the CSR values of a
   MATSEQAIJCUSPARSE matrix. Valid values are copied to the GPU first if needed, the GPU
   copy is marked authoritative, and the cached transpose is invalidated since the caller
   may modify entries. Pair with MatSeqAIJCUSPARSERestoreArray().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3824 
/*
   MatSeqAIJCUSPARSERestoreArray - releases the pointer obtained with
   MatSeqAIJCUSPARSEGetArray(); the object state is increased since the
   caller may have modified the values.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's copy of the pointer */
  PetscFunctionReturn(0);
}
3837 
/*
   MatSeqAIJCUSPARSEGetArrayWrite - returns a write-only device pointer to the CSR values
   of a MATSEQAIJCUSPARSE matrix.

   Unlike MatSeqAIJCUSPARSEGetArray(), no copy to the GPU is performed: the caller is
   expected to overwrite the values, so the current (possibly CPU-resident) contents are
   not transferred. NOTE(review): this presumes cusp->mat has already been set up by an
   earlier copy/assembly; otherwise the Missing-struct error below triggers.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* GPU copy becomes authoritative once written */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3857 
/*
   MatSeqAIJCUSPARSERestoreArrayWrite - releases the pointer obtained with
   MatSeqAIJCUSPARSEGetArrayWrite(); the object state is increased since the
   values have been (over)written.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate the caller's copy of the pointer */
  PetscFunctionReturn(0);
}
3870 
/* Strict-weak ordering for (row,col,value,flag) 4-tuples: lexicographic by row, then column */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3881 
/* Unary functor that adds a fixed offset to an integer index */
struct Shift
{
  int _shift; /* offset applied to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
3893 
/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in MATLAB notation */
3895 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3896 {
3897   PetscErrorCode               ierr;
3898   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3899   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3900   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3901   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3902   PetscInt                     Annz,Bnnz;
3903   cusparseStatus_t             stat;
3904   PetscInt                     i,m,n,zero = 0;
3905   cudaError_t                  cerr;
3906 
3907   PetscFunctionBegin;
3908   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3909   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3910   PetscValidPointer(C,4);
3911   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3912   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3913   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3914   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3915   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3916   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3917   if (reuse == MAT_INITIAL_MATRIX) {
3918     m     = A->rmap->n;
3919     n     = A->cmap->n + B->cmap->n;
3920     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3921     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3922     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3923     c     = (Mat_SeqAIJ*)(*C)->data;
3924     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3925     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3926     Ccsr  = new CsrMatrix;
3927     Cmat->cprowIndices      = NULL;
3928     c->compressedrow.use    = PETSC_FALSE;
3929     c->compressedrow.nrows  = 0;
3930     c->compressedrow.i      = NULL;
3931     c->compressedrow.rindex = NULL;
3932     Ccusp->workVector       = NULL;
3933     Ccusp->nrows    = m;
3934     Ccusp->mat      = Cmat;
3935     Ccusp->mat->mat = Ccsr;
3936     Ccsr->num_rows  = m;
3937     Ccsr->num_cols  = n;
3938     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3939     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3940     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3941     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3942     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3943     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3944     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3945     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3946     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3947     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3948     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3949     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
3950     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3951     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3952     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3953 
3954     Acsr = (CsrMatrix*)Acusp->mat->mat;
3955     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3956     Annz = (PetscInt)Acsr->column_indices->size();
3957     Bnnz = (PetscInt)Bcsr->column_indices->size();
3958     c->nz = Annz + Bnnz;
3959     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3960     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3961     Ccsr->values = new THRUSTARRAY(c->nz);
3962     Ccsr->num_entries = c->nz;
3963     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3964     if (c->nz) {
3965       auto Acoo = new THRUSTINTARRAY32(Annz);
3966       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3967       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3968       THRUSTINTARRAY32 *Aroff,*Broff;
3969 
3970       if (a->compressedrow.use) { /* need full row offset */
3971         if (!Acusp->rowoffsets_gpu) {
3972           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3973           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3974           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3975         }
3976         Aroff = Acusp->rowoffsets_gpu;
3977       } else Aroff = Acsr->row_offsets;
3978       if (b->compressedrow.use) { /* need full row offset */
3979         if (!Bcusp->rowoffsets_gpu) {
3980           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3981           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3982           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3983         }
3984         Broff = Bcusp->rowoffsets_gpu;
3985       } else Broff = Bcsr->row_offsets;
3986       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3987       stat = cusparseXcsr2coo(Acusp->handle,
3988                               Aroff->data().get(),
3989                               Annz,
3990                               m,
3991                               Acoo->data().get(),
3992                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3993       stat = cusparseXcsr2coo(Bcusp->handle,
3994                               Broff->data().get(),
3995                               Bnnz,
3996                               m,
3997                               Bcoo->data().get(),
3998                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3999       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
4000       auto Aperm = thrust::make_constant_iterator(1);
4001       auto Bperm = thrust::make_constant_iterator(0);
4002 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4003       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4004       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
4005 #else
4006       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
4007       auto Bcib = Bcsr->column_indices->begin();
4008       auto Bcie = Bcsr->column_indices->end();
4009       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
4010 #endif
4011       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
4012       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
4013       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
4014       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
4015       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
4016       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4017       auto p1 = Ccusp->cooPerm->begin();
4018       auto p2 = Ccusp->cooPerm->begin();
4019       thrust::advance(p2,Annz);
4020       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
4021 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
4022       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
4023 #endif
4024       auto cci = thrust::make_counting_iterator(zero);
4025       auto cce = thrust::make_counting_iterator(c->nz);
4026 #if 0 //Errors on SUMMIT cuda 11.1.0
4027       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
4028 #else
4029       auto pred = thrust::identity<int>();
4030       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4031       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4032 #endif
4033       stat = cusparseXcoo2csr(Ccusp->handle,
4034                               Ccoo->data().get(),
4035                               c->nz,
4036                               m,
4037                               Ccsr->row_offsets->data().get(),
4038                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4039       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4040       delete wPerm;
4041       delete Acoo;
4042       delete Bcoo;
4043       delete Ccoo;
4044 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4045       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4046                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4047                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4048                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4049 #endif
4050       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4051         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4052         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4053         CsrMatrix *CcsrT = new CsrMatrix;
4054         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4055         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4056 
4057         (*C)->form_explicit_transpose = PETSC_TRUE;
4058         (*C)->transupdated = PETSC_TRUE;
4059         Ccusp->rowoffsets_gpu = NULL;
4060         CmatT->cprowIndices = NULL;
4061         CmatT->mat = CcsrT;
4062         CcsrT->num_rows = n;
4063         CcsrT->num_cols = m;
4064         CcsrT->num_entries = c->nz;
4065 
4066         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4067         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4068         CcsrT->values = new THRUSTARRAY(c->nz);
4069 
4070         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4071         auto rT = CcsrT->row_offsets->begin();
4072         if (AT) {
4073           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4074           thrust::advance(rT,-1);
4075         }
4076         if (BT) {
4077           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4078           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4079           thrust::copy(titb,tite,rT);
4080         }
4081         auto cT = CcsrT->column_indices->begin();
4082         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4083         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4084         auto vT = CcsrT->values->begin();
4085         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4086         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4087         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4088 
4089         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4090         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4091         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4092         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4093         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4094         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4095         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4096         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4097         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4098 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4099         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4100                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4101                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4102                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4103 #endif
4104         Ccusp->matTranspose = CmatT;
4105       }
4106     }
4107 
4108     c->singlemalloc = PETSC_FALSE;
4109     c->free_a       = PETSC_TRUE;
4110     c->free_ij      = PETSC_TRUE;
4111     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4112     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4113     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4114       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4115       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4116       ii   = *Ccsr->row_offsets;
4117       jj   = *Ccsr->column_indices;
4118       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4119       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4120     } else {
4121       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4122       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4123     }
4124     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4125     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4126     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4127     c->maxnz = c->nz;
4128     c->nonzerorowcnt = 0;
4129     c->rmax = 0;
4130     for (i = 0; i < m; i++) {
4131       const PetscInt nn = c->i[i+1] - c->i[i];
4132       c->ilen[i] = c->imax[i] = nn;
4133       c->nonzerorowcnt += (PetscInt)!!nn;
4134       c->rmax = PetscMax(c->rmax,nn);
4135     }
4136     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4137     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4138     (*C)->nonzerostate++;
4139     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4140     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4141     Ccusp->nonzerostate = (*C)->nonzerostate;
4142     (*C)->preallocated  = PETSC_TRUE;
4143   } else {
4144     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4145     c = (Mat_SeqAIJ*)(*C)->data;
4146     if (c->nz) {
4147       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4148       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4149       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4150       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4151       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4152       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4153       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4154       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4155       Acsr = (CsrMatrix*)Acusp->mat->mat;
4156       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4157       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4158       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4159       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4160       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4161       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4162       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4163       auto pmid = Ccusp->cooPerm->begin();
4164       thrust::advance(pmid,Acsr->num_entries);
4165       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4166       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4167                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4168       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4169                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4170       thrust::for_each(zibait,zieait,VecCUDAEquals());
4171       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4172                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4173       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4174                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4175       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4176       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4177       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4178         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4179         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4180         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4181         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4182         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4183         auto vT = CcsrT->values->begin();
4184         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4185         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4186         (*C)->transupdated = PETSC_TRUE;
4187       }
4188       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4189     }
4190   }
4191   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4192   (*C)->assembled     = PETSC_TRUE;
4193   (*C)->was_assembled = PETSC_FALSE;
4194   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4195   PetscFunctionReturn(0);
4196 }
4197 
/* Copy selected entries of the aij->a value array of A into v.
   If idx is provided, v[i] = a[idx[i]] for i = 0..n-1 (a device-side gather);
   otherwise the leading n values are copied contiguously.
   v may be either host or device (CUDA) memory; the memory space is detected
   and the appropriate copy path and GPU<->CPU logging are used. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;   /* true when v resides in device memory */
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr); /* device pointer to the matrix values */
  if (n && idx) {
    THRUSTINTARRAY widx(n);    /* device copy of the index list */
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); /* upload of idx */

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);  /* device staging buffer, copied back to host v below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* device-side gather: dv[i] = dav[widx[i]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) { /* v is host memory: download the gathered values */
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else { /* no index list: straight copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the scalars moved device->host, so log GpuToCpu
     (the original code logged CpuToGpu here, which recorded the transfer in the wrong direction) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4237