xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 53efea5c680dffccb0b7659435e1dc1aa9e95c12)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
69 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
70 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
76 
77 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
82 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
83 
84 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
85 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
86 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
87 
88 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
89 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
90 
91 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92 
/* Record the CUDA stream on the matrix's GPU data structure and make the
   attached cuSPARSE handle issue its work on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
104 
/* Adopt a caller-provided cuSPARSE handle for this matrix, destroying any
   different handle held previously, and (re)set device pointer mode on it. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    /* release the previously owned handle before taking the new one */
    if (cusparsestruct->handle) {stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);}
    cusparsestruct->handle = handle;
  }
  /* scalar arguments (alpha/beta) will be passed as device pointers */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
121 
/* Forget (but do not destroy) the cuSPARSE handle stored on a SEQAIJCUSPARSE
   matrix; a no-op for any other matrix type or when no GPU struct exists.
   NOTE(review): the handle is only zeroed, so ownership is presumed to lie
   with whoever supplied it via MatCUSPARSESetHandle(). */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse && cusparsestruct && cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
134 
/* Callback composed on factor matrices created below: reports that the
   factorization package is MATSOLVERCUSPARSE ("cusparse"). */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
141 
142 /*MC
143   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
144   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
147   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
148   algorithms are not recommended. This class does NOT support direct solver operations.
149 
150   Level: beginner
151 
152 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
153 M*/
154 
/* Factory routine for MATSOLVERCUSPARSE: creates an empty square
   SEQAIJCUSPARSE matrix *B that will hold the (I)LU or (I)CC factors of A
   and installs the matching symbolic-factorization function pointers. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  /* factorizations provided here can take advantage of a fill-reducing ordering */
  (*B)->useordering = PETSC_TRUE;
  /* set the type first: MatSetType installs the default ops table that the
     branches below then override with the CUSPARSE symbolic routines */
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the symbolic factorization allocates the factor storage itself */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
180 
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the
   requested GPU storage format. For sequential matrices MULT and ALL both
   map onto the single stored format. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both ops set the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
198 
199 /*@
200    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
201    operation. Only the MatMult operation can use different GPU storage formats
202    for MPIAIJCUSPARSE matrices.
203    Not Collective
204 
205    Input Parameters:
206 +  A - Matrix of type SEQAIJCUSPARSE
207 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
208 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
209 
210    Output Parameter:
211 
212    Level: intermediate
213 
214 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
215 @*/
/* Public dispatcher: forwards to the type-specific "MatCUSPARSESetFormat_C"
   method if one is composed on A; silently a no-op otherwise (PetscTryMethod). */
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
225 
226 /*@
227    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
228 
229    Collective on mat
230 
231    Input Parameters:
232 +  A - Matrix of type SEQAIJCUSPARSE
233 -  transgen - the boolean flag
234 
235    Level: intermediate
236 
237 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
238 @*/
/* Toggle whether an explicit transpose is built for MatMultTranspose on a
   SEQAIJCUSPARSE matrix. Turning it off destroys any cached transpose so a
   later re-enable cannot see stale data. No-op for other matrix types;
   errors on factored matrices. */
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusparse;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse) PetscFunctionReturn(0);
  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    cusp->transgen = transgen;
    /* drop the cached transpose to prevent logic errors if transgen is re-enabled later */
    if (!transgen) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}
258 
/* Process the -mat_cusparse_* command-line options for a SEQAIJCUSPARSE
   matrix. Note the option order matters: -mat_cusparse_storage_format
   (MAT_CUSPARSE_ALL) is read after -mat_cusparse_mult_storage_format and so
   overrides it when both are given. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) { /* these options apply only to non-factored matrices */
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same position-vs-value consistency guard as for the SpMV enum above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
302 
/* Symbolic ILU: clear stale GPU triangular factors, run the CPU AIJ symbolic
   phase, then install the CUSPARSE numeric-factorization routine. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
314 
/* Symbolic LU: clear stale GPU triangular factors, run the CPU AIJ symbolic
   phase, then install the CUSPARSE numeric-factorization routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
326 
/* Symbolic ICC: clear stale GPU triangular factors, run the CPU AIJ symbolic
   phase, then install the CUSPARSE Cholesky numeric routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
338 
/* Symbolic Cholesky: clear stale GPU triangular factors, run the CPU AIJ
   symbolic phase, then install the CUSPARSE Cholesky numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
350 
/* Build (or refresh) the unit-diagonal lower triangular factor L of a
   factored SeqAIJ matrix as a CSR matrix on the GPU and run the cuSPARSE
   triangular-solve analysis on it. On the first call the full structure
   (row offsets, column indices, values, descriptor, analysis info) is
   created; later calls only re-upload the numerical values. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): ai[1] (not ai[0]) is used, presumably because row 0 of the
         stored strict-L part is empty in factored AIJ layout -- confirm against MatLUFactorNumeric_SeqAIJ */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host staging buffers for the host->device copies below */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix; row 0 holds only the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strict lower-triangular entries of row i ... */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          /* ... then append the explicit unit diagonal entry at the row's end */
          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: assigns copy the pinned host arrays to the device */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE csrsv2 API requires an explicit workspace buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for fast numeric-only updates; the
           index buffers are no longer needed on the host */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix (same layout as the initial build) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
495 
/* Build (or refresh) the upper triangular factor U of a factored SeqAIJ
   matrix as a CSR matrix on the GPU and run the cuSPARSE triangular-solve
   analysis on it. U is assembled from the back of the storage using a->diag
   (rows walked in reverse), with the stored reciprocal pivot placed first in
   each row. First call creates everything; later calls refresh values only. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* NOTE(review): adiag appears to run backwards (adiag[0] > adiag[n]) in
         factored AIJ storage, hence this difference -- confirm in aij.h docs */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host staging buffers for the host->device copies below */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz]; /* v[nz] holds the stored 1/pivot; invert back to the pivot */
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: assigns copy the pinned host arrays to the device */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE csrsv2 API requires an explicit workspace buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for fast numeric-only updates; the
           index buffers are no longer needed on the host */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix (same layout as the initial build) */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
639 
/* Builds the ILU lower and upper triangular factors on the GPU and, for
   non-natural orderings, caches the row/column permutation indices in device
   memory so MatSolve_SeqAIJCUSPARSE can apply them without host round trips. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS    = aij->row;
  IS                           colIS    = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* assemble both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device (used by the lower solve);
     allocate only once -- it is reused across re-factorizations */
  ierr = ISIdentity(rowIS,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowIS,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx,ridx+nrows);
    ierr = ISRestoreIndices(rowIS,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* cache the column permutation on the device (used by the upper solve) */
  ierr = ISIdentity(colIS,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colIS,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx,cidx+nrows);
    ierr = ISRestoreIndices(colIS,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
683 
/*
   Builds the ICC (incomplete Cholesky) triangular factors on the GPU from the
   host factor stored in A->data.  The host factor is an SBAIJ matrix holding
   only the upper triangle U (with inverse diagonal stored at the end of each
   row); this routine expands it into:
     - upTriFactor: U in CSR, unit-diagonal, solved non-transposed, and
     - loTriFactor: L = U^T, stored with the SAME CSR pattern as U but solved
       with CUSPARSE_OPERATION_TRANSPOSE (hence fill mode UPPER is correct).
   On the first call (both factor structs NULL) the CSR structure, descriptors
   and cusparse solve-analysis data are created; on later calls with the same
   nonzero pattern only the numerical values are refreshed on the device.
   Host staging buffers are pinned (cudaMallocHost) for fast H2D transfer.
 */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;  /* same data viewed as SBAIJ (upper triangle only) */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the U and L values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first factorization: build structure + values */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* the host factor stores the INVERSE of the diagonal in v[nz];
             1/v[nz] therefore recovers d_i, used to scale both factors */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* negate off-diagonals for U; L entries additionally scaled by d_i */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* NOTE: FILL_MODE_UPPER is intentional: L is stored as U's pattern and
           solved with CUSPARSE_OPERATION_TRANSPOSE below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* re-factorization with unchanged pattern: refresh only the values */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
899 
/*
   Builds the ICC triangular factors on the GPU and, for non-natural orderings,
   caches the permutation (rpermIndices <- ip) and its inverse
   (cpermIndices <- ip^{-1}) in device memory for use by the GPU solves.
   nnz counts both factors: off-diagonal entries appear in U and L (hence *2),
   the diagonal only once.
 */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  /* guard on rpermIndices (as MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU does)
     so a numeric re-factorization does not leak the previously allocated
     device permutation arrays */
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
937 
/* Numeric LU factorization: factor on the CPU via MatLUFactorNumeric_SeqAIJ,
   then stage the triangular factors on the GPU and pick the solve kernels
   (the permutation-free variants when both orderings are the identity). */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(baij->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(baij->col,&colIdentity);CHKERRQ(ierr);
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
968 
/* Numeric Cholesky (ICC) factorization: factor on the CPU via
   MatCholeskyFactorNumeric_SeqAIJ, then stage the triangular factors on the
   GPU and pick the solve kernels (the permutation-free variants when the
   ordering is the identity). */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *baij = (Mat_SeqAIJ*)B->data;
  PetscBool      permIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(baij->row,&permIdentity);CHKERRQ(ierr);
  if (permIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
998 
/*
   Builds the transposes (CSC representations) of both triangular factors and
   runs the cusparse solve analysis on each, caching the results in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr so that
   MatSolveTranspose can run entirely on the GPU.  Each transpose inherits the
   index base, matrix type and diag type of its source factor, with the fill
   mode flipped (upper <-> lower), and is solved non-transposed.

   Bug fix: the second PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,...)
   after each csr2csc was changed to PetscLogEventEnd; the event was previously
   begun twice and never ended, corrupting -log_view output.
 */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  /* the transpose of an upper triangular matrix is lower triangular, and vice versa */
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was Begin: event was never ended */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was Begin: event was never ended */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1200 
/* Unary functor converting a PetscScalar into a PetscInt by truncating its real
   part. Used with thrust::transform to recover integer permutation indices that
   were carried through a scalar-valued cuSPARSE csr2csc conversion. */
struct PetscScalarToPetscInt
{
  __host__ __device__ PetscInt operator()(PetscScalar val)
  {
    return (PetscInt)PetscRealPart(val);
  }
};
1209 
/* Builds (or refreshes) the explicit cuSPARSE representation of A^T stored in
   cusparsestruct->matTranspose, so transpose products can be run as a plain SpMV
   on the stored transpose.

   Behavior visible in this routine:
   - no-op unless cusparsestruct->transgen is set and A has nonzero row/col counts;
   - no-op if cusparsestruct->transupdated is already set (transpose is current);
   - on first use allocates the Mat_SeqAIJCUSPARSEMultStruct for A^T (descriptor,
     device-side alpha/beta constants, CSC-shaped index/value arrays);
   - on later calls with CSR format, only the numerical values are refreshed via a
     precomputed permutation (cusparsestruct->csr2csc_i), avoiding a full csr2csc. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* make sure the forward matrix is current on the device */
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (cusparsestruct->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
  if (cusparsestruct->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats cannot do the values-only refresh below, so drop any stale transpose */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident 1 and 0 constants used by SpMV calls */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* A^T in CSR form has swapped row/col counts; same number of entries */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* cache A's row offsets on the GPU (host a->i copied to device) */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      /* legacy (pre CUDA-11) path: HYB -> CSR -> CSC -> HYB round trip through temporaries */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      /* NOTE(review): tempT holds the transpose, yet num_rows/num_cols are not swapped
         relative to temp (rows=A->rmap->n, cols=A->cmap->n); row_offsets is sized by
         A->rmap->n+1 as well. Looks suspicious for rectangular A — confirm intended. */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      cusparsestruct->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* One-time trick: run csr2csc on the sequence 0,1,...,nnz-1 stored as scalars.
         The transposed "values" then spell out, for each CSC slot, which CSR entry
         feeds it; converting them to PetscInt yields a gather map (csr2csc_i) that
         makes all future transpose updates a single permutation copy. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11 requires an explicit workspace for Csr2cscEx2 */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      stat = cusparse_csr2csc(cusparsestruct->handle,
                              A->rmap->n,A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),cusparsestruct->rowoffsets_gpu->data().get(),matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase
                             #endif
);CHKERRCUSPARSE(stat);
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* values-only refresh: gather A's values into A^T's layout through csr2csc_i */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  cusparsestruct->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1403 
1404 /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solves A^T x = b on the GPU using the transposed triangular factors.
   Since A = L U was factored with row/column permutations, the sequence here is:
   row-permute b, solve U^T, solve L^T, then column-permute the result.
   The transposed factor structures are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, writing into x as scratch */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U^T (x -> tempGPU); transposed order is U^T before L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L^T (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1490 
/* Solves A^T x = b on the GPU for a factorization done in natural ordering:
   no row/column permutations are applied, so b feeds U^T directly and the
   L^T solve writes straight into x. Transposed factors are built lazily. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U^T (b -> tempGPU); transposed order is U^T before L^T */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L^T (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1558 
/* Solves A x = b on the GPU using the L and U triangular factors:
   row-permute b into the work vector, solve L, solve U, then column-permute
   the result into x. The work vector doubles as scratch between the stages. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U (x -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation (tempGPU -> x) */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1633 
/* Solves A x = b on the GPU for a factorization done in natural ordering:
   no permutations, so b feeds L directly (via the work vector) and the U
   solve writes straight into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L (b -> tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U (tempGPU -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves cost roughly 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1693 
/* Copies the matrix values from the GPU back to the host CSR array (a->a) when
   the authoritative copy lives on the device (PETSC_OFFLOAD_GPU), then marks
   both copies as current (PETSC_OFFLOAD_BOTH). No-op for any other offload state.

   Fix: the original unconditionally cast cusp->mat->mat to CsrMatrix* and
   dereferenced matrix->values. That cast is only meaningful for the
   MAT_CUSPARSE_CSR storage format (ELL/HYB store an opaque cusparse handle),
   and the pointers may legitimately be unset; guard them explicitly, matching
   the "Missing CsrMatrix ..." checks used elsewhere in this file. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix;

    if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_CUSPARSE_CSR supported");
    if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1714 
/* Hands the caller the host-side CSR value array of A. Any values currently on
   the GPU are synced back first; the offload mask is then set to
   PETSC_OFFLOAD_CPU because the caller may modify the array through this pointer. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1726 
/*
   MatSeqAIJCUSPARSECopyToGPU - makes sure the matrix data of A is current on the GPU.

   Fast path: if the nonzero pattern has not changed since the last upload (same nonzerostate)
   and the storage format is plain CSR, only the numerical values are copied into the existing
   device arrays.  Otherwise all device-side structures are destroyed and rebuilt from the host
   CSR data (using the compressed-row representation when present, and converting to ELL/HYB
   for CUDA < 11 when that format was requested).
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* set to PETSC_FALSE when there are no host values to upload */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed but the pattern did not: the cached transpose values are stale */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* full rebuild: throw away all device-side structures first */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the compressed-row view when available (skips empty rows) */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } /* no host values yet: allocate device storage only */
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident 1/0 scalars, used with CUSPARSE_POINTER_MODE_DEVICE below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* build a temporary device CSR first, then convert it to HYB/ELL */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR copy is no longer needed after the conversion */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1883 
/* thrust functor: accumulate the first tuple element into the second (y += x) */
struct VecCUDAPlusEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1893 
/* thrust functor: copy the first tuple element into the second (y = x) */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1903 
/* thrust functor: copy the second tuple element into the first (x = y) */
struct VecCUDAEqualsReverse
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1913 
/* Product data attached to C->product for CUSPARSE-based matrix-matrix products */
struct MatMatCusparse {
  PetscBool             cisdense;  /* if true, convert C back to MATSEQDENSE (CPU) after the numeric phase */
  PetscScalar           *Bt;       /* device buffer for B^T (CUDA < 11 only, where csrmm cannot transpose B) */
  Mat                   X;         /* intermediate dense result used by PtAP/RARt */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count logged in the numeric phase */
  CsrMatrix             *Bcsr;    /* when set, B in uncompressed CSR form (B may be stored compressed) */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;  /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;  /* dense descriptor for C (or for X) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* size of mmBuffer in bytes */
  void                  *mmBuffer;    /* SpMM/SpGEMM workspace */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1933 
/*
   MatDestroy_MatMatCusparse - destroy callback for the MatMatCusparse product data:
   frees the device buffers, the cusparse descriptors, and the intermediate matrix X.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so no guard is needed */
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1958 
1959 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1960 
/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of the product of a
   SEQAIJCUSPARSE matrix A with a dense matrix B (AB, AtB, ABt, PtAP, RARt).

   The sparse-times-dense part is done with cusparseSpMM() on CUDA >= 11 and with
   cusparse_csr_spmm() otherwise (the latter cannot transpose B, so B^T is formed
   explicitly in mmdata->Bt).  For PtAP/RARt the intermediate result goes to mmdata->X and
   the remaining dense-dense product is delegated to
   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private().
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select op(A) and the result sizes m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* prefer the explicitly stored transpose over the cusparse transpose operation */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt write the sparse product into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* form B^T explicitly in mmdata->Bt with a cuBLAS out-of-place transpose */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with the dense-dense product C = B^T*X or C = B*X^T respectively */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the CPU representations the caller originally supplied */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2134 
/*
   MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for products of a
   SEQAIJCUSPARSE matrix with a dense matrix: sets the size and type of C, allocates the
   MatMatCusparse product data (including the B^T buffer for CUDA < 11 and the intermediate
   X for PtAP/RARt), and installs the numeric-phase callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result sizes m x n for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2208 
/*
   MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse
   product C = op(A)*op(B) with both operands SEQAIJCUSPARSE.  Uses
   cusparseSpGEMM_compute()/cusparseSpGEMM_copy() on CUDA >= 11 and cusparse_csr_spgemm()
   otherwise; transposed operands are realized via the cached explicit transposes
   (Acusp->matTranspose / Bcusp->matTranspose) built during the symbolic phase.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* exploit symmetry: op(A) = A when A is symmetric, avoiding the stored transpose */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2323 
/* Symbolic phase of the sparse-sparse product C = A*B (also A^t*B and A*B^t) with both
   operands of type SEQAIJCUSPARSE stored as CSR. The sparsity pattern of C is computed
   on the GPU (cusparseSpGEMM on CUDA >= 11, csrgemm on older toolkits) and then copied
   back so the host Mat_SeqAIJ structure of C mirrors it. With the pre-11 API the numeric
   values are computed here as well, since that interface cannot do a symbolic-only
   factorization (see the comment in the pre-11 branch below). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1; /* sizes reported back by cusparseSpMatGetSize() */
  size_t                       bufSize2;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz; /* nnz of C as reported by cusparseXcsrgemmNnz() */
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* only implemented for SEQAIJCUSPARSE operands, both in plain CSR storage */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  ptype = product->type;
  /* a symmetric operand lets us replace a transposed product with the cheaper plain product */
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* pick the (possibly transposed) GPU mult structs and the result sizes: C is m x n, inner dimension k */
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    /* C inherits A's compressed-row pattern: same nonzero rows, same row index mapping */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident alpha/beta scalars, required since the handle is put in device pointer mode below */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: build an all-zero C and skip the GPU gemm entirely */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a shallow wrapper around B's CSR that reuses values/columns but swaps in
       full (non-compressed) row offsets, since the gemm works on full-size rows */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr; /* owned by the product data, freed in MatDestroy_MatMatCusparse */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each a_ij multiplies row j of B: 2*nnz(B_j) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A^t pairs with row i of B: 2*nnz(A_i)*nnz(B_i) flops */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11: generic SpGEMM API. The work-estimation/compute calls are each issued
     twice: once with a NULL buffer to query the size, once with the allocated buffer. */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                           NULL, NULL, NULL,
                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  /* CUDA < 11: legacy csrgemm. Nnz query runs in host pointer mode, the gemm itself in device pointer mode. */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU sparsity pattern into the host Mat_SeqAIJ (i, j, ilen, imax, ...) */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED; /* the host copy is not valid yet */
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2648 
2649 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2650 
/* Selects the symbolic product implementation for a product whose A is SEQAIJCUSPARSE.
   Handles sparse (SEQAIJCUSPARSE) or dense (SEQDENSE) B; falls back to the host AIJ
   setup when the GPU path does not apply (operands bound to the CPU or of other types). */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bdense = PETSC_FALSE,Bgpu = PETSC_FALSE,Cgpu = PETSC_TRUE;
  MatProductType ptype;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  /* B counts as GPU-capable only if neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bgpu);CHKERRQ(ierr);
  }
  /* the third operand matters only for ABC products */
  if (product->type == MATPRODUCT_ABC) {
    Cgpu = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Cgpu);CHKERRQ(ierr);
    }
  }
  ptype = product->type;
  if (Bdense) {
    if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt || ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) { /* CPU-bound A: use the host AIJ-times-dense setup */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave the operation unset */
  } else if (Bgpu && Cgpu) {
    if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt || ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave the operation unset */
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2709 
/* yy = A*xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no add-vector, no transpose, no Hermitian */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2718 
/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* add-vector yy, no transpose, no Hermitian */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2727 
/* yy = A^H * xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no add-vector; transpose + Hermitian selects the conjugate-transpose op */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2736 
/* zz = A^H * xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* add-vector yy; transpose + Hermitian selects the conjugate-transpose op */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2745 
/* yy = A^T * xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* no add-vector; transpose without Hermitian conjugation */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2754 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   (herm without trans is rejected below.) yy may be NULL (no add) or alias zz (in-place add).
   Shared worker behind the MatMult/MatMultAdd/MatMultTranspose(Add)/MatMultHermitianTranspose(Add)
   operations; also handles compressed-row storage (zero rows dropped) via a scatter/gather
   work vector. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for y = op(A) x; set in the CSR branches below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows, so op(A) x = 0: z is just 0 or a copy of y */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* use the non-transposed storage with a transposed cusparse operation */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use an explicitly generated transpose so the spmv itself is non-transposed */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* nx/ny are guaranteed set here: the non-CSR formats error out below for CUDA >= 11 */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        /* create the dense vector descriptors and the SpMV scratch buffer once per op */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transposed case: the spmv already accumulated into zz (beta_one) unless yy is a distinct vector */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without an add-vector the first write per nonzero row is not an add */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2938 
/* zz = A^T*xx + yy: thin wrapper that forwards to the shared multiply-add kernel.
   The flags select transpose mode without Hermitian conjugation (the Hermitian
   variants are registered separately in MatBindToCPU_SeqAIJCUSPARSE). */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2947 
/* Finish assembly of a SeqAIJCUSPARSE matrix. Runs the host-side SeqAIJ assembly
   and, when a device-side split-CSR matrix exists (values were assembled directly
   on the GPU), marks the GPU copy as the authoritative one. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;
  PetscFunctionBegin;
  /* only an unfactored matrix stores Mat_SeqAIJCUSPARSE in spptr; factored
     matrices keep a triangular-factors structure there instead */
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  /* flush assemblies and CPU-bound matrices need no offload-mask update */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    /* a device matrix exists, so the freshest values live on the GPU */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}
2964 
2965 /* --------------------------------------------------------------------------------*/
2966 /*@
2967    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2969    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2970    assembly performance the user should preallocate the matrix storage by setting
2971    the parameter nz (or the array nnz).  By setting these parameters accurately,
2972    performance during matrix assembly can be increased by more than a factor of 50.
2973 
2974    Collective
2975 
2976    Input Parameters:
2977 +  comm - MPI communicator, set to PETSC_COMM_SELF
2978 .  m - number of rows
2979 .  n - number of columns
2980 .  nz - number of nonzeros per row (same for all rows)
2981 -  nnz - array containing the number of nonzeros in the various rows
2982          (possibly different for each row) or NULL
2983 
2984    Output Parameter:
2985 .  A - the matrix
2986 
2987    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2989    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2990 
2991    Notes:
2992    If nnz is given then nz is ignored
2993 
2994    The AIJ format (also called the Yale sparse matrix format or
2995    compressed row storage), is fully compatible with standard Fortran 77
2996    storage.  That is, the stored row and column indices can begin at
2997    either one (as in Fortran) or zero.  See the users' manual for details.
2998 
2999    Specify the preallocated storage with either nz or nnz (not both).
3000    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
3001    allocation.  For large problems you MUST preallocate memory or you
3002    will get TERRIBLE performance, see the users' manual chapter on matrices.
3003 
3004    By default, this format uses inodes (identical nodes) when possible, to
3005    improve numerical efficiency of matrix-vector products and solves. We
3006    search for consecutive rows with the same nonzero structure, thereby
3007    reusing matrix information to achieve increased efficiency.
3008 
3009    Level: intermediate
3010 
3011 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
3012 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes coincide with global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate via the SeqAIJ routine; per the manpage above, nnz (if non-NULL) overrides nz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3024 
/* Destroy a SeqAIJCUSPARSE matrix: free the GPU-side structures in spptr
   (mult structures for an unfactored matrix, triangular factors otherwise),
   free the device-side split-CSR matrix if one was created, detach all composed
   methods, and finally run the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device-matrix pointer before spptr is destroyed so it can be freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so embedded device allocations can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i was separately cudaMalloc'ed in the compressed-row case */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* remove every method composed in MatConvert/MatBindToCPU */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3060 
3061 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3062 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a plain SeqAIJ copy on the host and converting it
   in place to SeqAIJCUSPARSE (GPU data is recreated lazily on first use). */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3072 
/* Y = a*X + Y entirely on the GPU when both matrices are SeqAIJCUSPARSE in CSR
   format. Strategy:
     - identical sparsity pattern -> one cublas axpy on the value arrays;
     - SUBSET_NONZERO_PATTERN     -> cusparse csrgeam accumulated in place into Y;
     - otherwise                   -> fall back to the host MatAXPY_SeqAIJ. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mixed CPU/GPU binding: do the work on the host and drop Y's stale transpose */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* same nnz count: compare row offsets and column indices on the device to
       detect an identical pattern the caller did not declare */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in the geam: Y = a*X + 1*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a and b live on the host for this call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit user-provided work buffer for csrgeam */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* output aliases Y's arrays: result is accumulated in place into Y */
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore PETSc's default device pointer mode on the shared handle */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical pattern: axpy directly on the flat value arrays */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different pattern: host fallback (may reallocate Y's structure) */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3173 
/* Y = a*Y on the GPU: a single cublas scal over the nz stored values. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *ay;
  cudaError_t    cerr;
  cublasHandle_t cublasv2handle;
  cublasStatus_t berr;
  PetscBLASInt   one = 1, bnz = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
  /* nz may exceed PetscBLASInt range; cast with overflow check */
  ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
  /* values changed: the cached diagonal is no longer valid */
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3197 
/* Zero all stored values on both the device (CSR values of A and, if present,
   its cached transpose) and the host copy, keeping the sparsity pattern. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE; /* set when device values were zeroed too */
  Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      /* NOTE(review): the casts below assume the CSR storage format; ELL/HYB
         would store a different structure in ->mat — confirm callers guard this */
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      /* keep the cached transpose consistent instead of invalidating it */
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  /* zero the host value array directly (a->i[nrows] == total nz) */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
3229 
/* Switch the matrix's operation table between host (SeqAIJ) and device
   (SeqAIJCUSPARSE) implementations. flg == PETSC_TRUE binds to the CPU:
   values are first synced back to the host and the GPU-specific composed
   methods are removed; flg == PETSC_FALSE (re)installs the GPU paths. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored matrices have their own solve paths; nothing to rebind */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host arrays are current before handing control to SeqAIJ */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to the CPU */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3280 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE. Allocates the GPU bookkeeping
   structure in spptr (mult data for unfactored matrices, triangular-factor
   data otherwise), installs the CUSPARSE operation table, and switches the
   default vector type to VECCUDA. Supports MAT_INITIAL_MATRIX (duplicate),
   MAT_REUSE_MATRIX (copy into *newmat) and MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created from this matrix should live on the GPU as well */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    /* no GPU data allocated yet; it is built lazily at first CopyToGPU */
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation table and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3328 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix,
   convert it in place, then process runtime options (e.g. storage format). */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3341 
3342 /*MC
3343    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3344 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3346    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3347    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3348 
3349    Options Database Keys:
3350 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3351 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3352 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3353 
3354   Level: beginner
3355 
3356 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3357 M*/
3358 
3359 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3360 
3361 
/* Register the CUSPARSE solver package for every factorization type it
   supports on MATSEQAIJCUSPARSE matrices (LU, Cholesky, ILU, ICC). */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              k;

  PetscFunctionBegin;
  for (k=0; k<sizeof(ftypes)/sizeof(ftypes[0]); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3373 
/* Free the full Mat_SeqAIJCUSPARSE structure: both mult structures (matrix and
   cached transpose), the thrust-allocated work arrays, the cusparse handle,
   and finally the struct itself (PetscFree also NULLs *cusparsestruct). */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* deleting NULL is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3393 
/* Release a CsrMatrix: delete the three thrust device vectors it owns
   (values, column indices, row offsets), then the struct, and NULL the
   caller's pointer. Safe to call with *mat == NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3406 
/* Release one triangular-factor structure: its cusparse descriptor and
   analysis info, the CSR copy of the factor, and the device/host scratch
   buffers. PetscFree NULLs *trifactor on the way out. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is pinned host memory, hence cudaFreeHost */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* the csr2csc conversion buffer only exists with the CUDA>=11 API */
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3426 
/* Release a mult structure. The interpretation of ->mat depends on the
   storage format: a cusparseHybMat_t for ELL/HYB (pre-CUDA-11 only), a
   CsrMatrix otherwise. Also frees the descriptor, the compressed-row index
   vector, the device-resident scalar constants, and (CUDA >= 11) the SpMV
   descriptors/buffers cached per operation. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* bug fix: the return code of CsrMatrix_Destroy() was previously ignored,
           unlike every other destroy call in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-allocated scalar constants */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation (non-transpose/transpose/hermitian) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3470 
/* Reset (but do not free) a triangular-factors container: destroy all four
   factor structures and the permutation/work vectors, leaving the container
   itself and its cusparse handle intact for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* delete each thrust vector and immediately clear the dangling pointer */
    delete fs->rpermIndices;
    fs->rpermIndices = NULL;
    delete fs->cpermIndices;
    fs->cpermIndices = NULL;
    delete fs->workVector;
    fs->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
3490 
/* Fully destroy a triangular-factors container: reset its contents, destroy
   the cusparse handle it owns, and free the container (PetscFree also NULLs
   the caller's pointer). */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3507 
/* Lexicographic "less than" on (i,j) index pairs: order by first component,
   breaking ties on the second. Used when sorting COO entries. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return (t1.get<0>() < t2.get<0>()) || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3518 
/* Equality of (i,j) index pairs: true iff both components match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3528 
/* Indicator of inequality: 1 when the two values differ, 0 otherwise. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3537 
/* Logical OR folded to an integer: 1 when either value is nonzero, else 0. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3546 
3547 #include <thrust/iterator/discard_iterator.h>
/*
   MatSetValuesCOO_SeqAIJCUSPARSE - scatter the COO-ordered values v[] into the
   device CSR storage of A, using the permutation cusp->cooPerm (and, when the
   COO list contained repeated (i,j) pairs, the entry-to-nonzero map
   cusp->cooPerm_a) computed by MatSetPreallocationCOO_SeqAIJCUSPARSE.

   Input Parameters:
+  A     - the MATSEQAIJCUSPARSE matrix
.  v     - the values, one per original COO entry; may point to host or device
           memory.  A NULL v combined with INSERT_VALUES zeroes the stored values.
-  imode - INSERT_VALUES (overwrite) or ADD_VALUES (accumulate)
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;  /* temporary device copy of v, allocated only when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation was done: fall back to a plain final assembly and return */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) { /* NULL values: only meaningful action is zeroing for INSERT_VALUES */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size(); /* number of original COO entries, duplicates included */
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v); /* v already resides on the device: use it in place */
  } else {
    cooPerm_v = new THRUSTARRAY(n); /* stage host values on the device */
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* duplicates present: sum them into a workspace keyed by destination nonzero, then add */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); /* v permuted into sorted COO order */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else { /* unique entries: accumulate the permuted values directly into the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else { /* unique entries: copy the permuted values over the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is now authoritative */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3626 
/* Mark the cached transpose of A as out of date; with destroy == PETSC_TRUE the
   transpose multiply structure and the csr2csc permutation are freed as well.
   A no-op when the CUSPARSE struct has not been created yet. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    cusp->transupdated = PETSC_FALSE; /* force regeneration before the next transpose product */
  }
  PetscFunctionReturn(0);
}
3643 
3644 #include <thrust/binary_search.h>
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - build the CSR nonzero pattern of A
   from n COO entries (coo_i[],coo_j[]) sorted on the device, and record the
   permutation cusp->cooPerm (plus, when duplicate (i,j) pairs exist, the
   entry-to-unique-nonzero map cusp->cooPerm_a) used later by
   MatSetValuesCOO_SeqAIJCUSPARSE to scatter values.

   Input Parameters:
+  A     - the MATSEQAIJCUSPARSE matrix
.  n     - number of COO entries (duplicates allowed)
.  coo_i - host array of row indices, length n
-  coo_j - host array of column indices, length n
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts the nonempty rows */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* reuse existing permutation arrays only when the entry count is unchanged */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n); /* device copies of the COO row/column indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n); /* per-row cumulative entry counts (row offsets without the leading 0) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) pairs lexicographically, dragging the identity sequence along to obtain the permutation */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i; /* keep copies of the sorted indices before unique() compacts them */
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag positions where the row or the column index changes, OR the flags, then
         inclusive-scan so each sorted COO entry maps to the index of its unique nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0; /* adjacent_difference leaves the first element untouched; reset it */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count entries per row via upper_bound on the sorted (uniqued) row indices */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays of A from the device pattern */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i]; /* nonzeros in row i */
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else { /* empty COO list: set up an empty matrix */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3744 
/* Hands out a read-only device pointer to the CSR values of A, after making the
   device copy current; only the CSR storage format is supported.  Pair with
   MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* refresh device values before exposing them */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3763 
/* Companion to MatSeqAIJCUSPARSEGetArrayRead(): invalidates the caller's
   pointer.  Read-only access, so the object state is not bumped. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* prevent use after restore */
  PetscFunctionReturn(0);
}
3773 
/* Hands out a mutable device pointer to the CSR values of A: the device copy is
   made current first, the offload mask is switched to GPU, and the cached
   transpose is flagged stale (kept allocated).  Pair with
   MatSeqAIJCUSPARSERestoreArray(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); /* refresh device values before exposing them */
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may modify the device values */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3794 
/* Companion to MatSeqAIJCUSPARSEGetArray(): bumps the object state (values may
   have been modified on the device) and invalidates the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values assumed changed */
  *a = NULL; /* prevent use after restore */
  PetscFunctionReturn(0);
}
3807 
/* Hands out a write-only device pointer to the CSR values of A.  Unlike
   MatSeqAIJCUSPARSEGetArray() the current values are NOT copied to the device
   first, since the caller promises to overwrite them.  Pair with
   MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller will write the device values */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3827 
/* Companion to MatSeqAIJCUSPARSEGetArrayWrite(): bumps the object state (the
   device values were rewritten) and invalidates the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values assumed changed */
  *a = NULL; /* prevent use after restore */
  PetscFunctionReturn(0);
}
3840 
/* Strict-weak ordering on (row,col,value,flag) 4-tuples, comparing only the
   leading (row,col) pair lexicographically; used to merge two COO streams */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 < r2) || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
3851 
/* Unary functor adding a fixed offset to an integer index (e.g. to displace
   column indices of one matrix past the columns of another) */
struct Shift
{
  int _shift; /* the constant offset applied by operator() */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
3863 
3864 /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3865 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3866 {
3867   PetscErrorCode               ierr;
3868   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3869   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3870   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3871   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3872   PetscInt                     Annz,Bnnz;
3873   cusparseStatus_t             stat;
3874   PetscInt                     i,m,n,zero = 0;
3875   cudaError_t                  cerr;
3876 
3877   PetscFunctionBegin;
3878   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3879   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3880   PetscValidPointer(C,4);
3881   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3882   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3883   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3884   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3885   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3886   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3887   if (reuse == MAT_INITIAL_MATRIX) {
3888     m     = A->rmap->n;
3889     n     = A->cmap->n + B->cmap->n;
3890     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3891     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3892     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3893     c     = (Mat_SeqAIJ*)(*C)->data;
3894     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3895     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3896     Ccsr  = new CsrMatrix;
3897     Cmat->cprowIndices      = NULL;
3898     c->compressedrow.use    = PETSC_FALSE;
3899     c->compressedrow.nrows  = 0;
3900     c->compressedrow.i      = NULL;
3901     c->compressedrow.rindex = NULL;
3902     Ccusp->workVector       = NULL;
3903     Ccusp->nrows    = m;
3904     Ccusp->mat      = Cmat;
3905     Ccusp->mat->mat = Ccsr;
3906     Ccsr->num_rows  = m;
3907     Ccsr->num_cols  = n;
3908     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3909     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3910     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3911     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3912     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3913     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3914     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3915     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3916     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3917     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3918     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3919     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
3920     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
3921     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3922     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3923 
3924     Acsr = (CsrMatrix*)Acusp->mat->mat;
3925     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3926     Annz = (PetscInt)Acsr->column_indices->size();
3927     Bnnz = (PetscInt)Bcsr->column_indices->size();
3928     c->nz = Annz + Bnnz;
3929     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3930     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3931     Ccsr->values = new THRUSTARRAY(c->nz);
3932     Ccsr->num_entries = c->nz;
3933     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3934     if (c->nz) {
3935       auto Acoo = new THRUSTINTARRAY32(Annz);
3936       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
3937       auto Ccoo = new THRUSTINTARRAY32(c->nz);
3938       THRUSTINTARRAY32 *Aroff,*Broff;
3939 
3940       if (a->compressedrow.use) { /* need full row offset */
3941         if (!Acusp->rowoffsets_gpu) {
3942           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3943           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3944           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3945         }
3946         Aroff = Acusp->rowoffsets_gpu;
3947       } else Aroff = Acsr->row_offsets;
3948       if (b->compressedrow.use) { /* need full row offset */
3949         if (!Bcusp->rowoffsets_gpu) {
3950           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3951           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3952           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3953         }
3954         Broff = Bcusp->rowoffsets_gpu;
3955       } else Broff = Bcsr->row_offsets;
3956       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3957       stat = cusparseXcsr2coo(Acusp->handle,
3958                               Aroff->data().get(),
3959                               Annz,
3960                               m,
3961                               Acoo->data().get(),
3962                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3963       stat = cusparseXcsr2coo(Bcusp->handle,
3964                               Broff->data().get(),
3965                               Bnnz,
3966                               m,
3967                               Bcoo->data().get(),
3968                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3969       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
3970       auto Aperm = thrust::make_constant_iterator(1);
3971       auto Bperm = thrust::make_constant_iterator(0);
3972 #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3973       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3974       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
3975 #else
3976       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
3977       auto Bcib = Bcsr->column_indices->begin();
3978       auto Bcie = Bcsr->column_indices->end();
3979       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
3980 #endif
3981       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
3982       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
3983       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
3984       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
3985       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
3986       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
3987       auto p1 = Ccusp->cooPerm->begin();
3988       auto p2 = Ccusp->cooPerm->begin();
3989       thrust::advance(p2,Annz);
3990       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
3991 #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
3992       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
3993 #endif
3994       auto cci = thrust::make_counting_iterator(zero);
3995       auto cce = thrust::make_counting_iterator(c->nz);
3996 #if 0 //Errors on SUMMIT cuda 11.1.0
3997       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
3998 #else
3999       auto pred = thrust::identity<int>();
4000       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
4001       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
4002 #endif
4003       stat = cusparseXcoo2csr(Ccusp->handle,
4004                               Ccoo->data().get(),
4005                               c->nz,
4006                               m,
4007                               Ccsr->row_offsets->data().get(),
4008                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4009       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4010       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4011       delete wPerm;
4012       delete Acoo;
4013       delete Bcoo;
4014       delete Ccoo;
4015 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4016       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4017                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4018                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4019                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4020 #endif
4021       if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
4022         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4023         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4024         CsrMatrix *CcsrT = new CsrMatrix;
4025         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4026         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4027 
4028         Ccusp->transgen = PETSC_TRUE;
4029         Ccusp->transupdated = PETSC_TRUE;
4030         Ccusp->rowoffsets_gpu = NULL;
4031         CmatT->cprowIndices = NULL;
4032         CmatT->mat = CcsrT;
4033         CcsrT->num_rows = n;
4034         CcsrT->num_cols = m;
4035         CcsrT->num_entries = c->nz;
4036 
4037         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4038         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4039         CcsrT->values = new THRUSTARRAY(c->nz);
4040 
4041         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4042         auto rT = CcsrT->row_offsets->begin();
4043         if (AT) {
4044           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4045           thrust::advance(rT,-1);
4046         }
4047         if (BT) {
4048           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4049           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4050           thrust::copy(titb,tite,rT);
4051         }
4052         auto cT = CcsrT->column_indices->begin();
4053         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4054         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4055         auto vT = CcsrT->values->begin();
4056         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4057         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4058         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4059         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4060 
4061         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4062         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4063         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4064         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4065         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4066         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4067         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4068         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4069         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4070 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4071         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4072                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4073                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4074                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4075 #endif
4076         Ccusp->matTranspose = CmatT;
4077       }
4078     }
4079 
4080     c->singlemalloc = PETSC_FALSE;
4081     c->free_a       = PETSC_TRUE;
4082     c->free_ij      = PETSC_TRUE;
4083     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4084     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4085     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4086       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4087       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4088       ii   = *Ccsr->row_offsets;
4089       jj   = *Ccsr->column_indices;
4090       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4091       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4092     } else {
4093       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4094       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4095     }
4096     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4097     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4098     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4099     c->maxnz = c->nz;
4100     c->nonzerorowcnt = 0;
4101     c->rmax = 0;
4102     for (i = 0; i < m; i++) {
4103       const PetscInt nn = c->i[i+1] - c->i[i];
4104       c->ilen[i] = c->imax[i] = nn;
4105       c->nonzerorowcnt += (PetscInt)!!nn;
4106       c->rmax = PetscMax(c->rmax,nn);
4107     }
4108     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4109     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4110     (*C)->nonzerostate++;
4111     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4112     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4113     Ccusp->nonzerostate = (*C)->nonzerostate;
4114     (*C)->preallocated  = PETSC_TRUE;
4115   } else {
4116     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4117     c = (Mat_SeqAIJ*)(*C)->data;
4118     if (c->nz) {
4119       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4120       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4121       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4122       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4123       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4124       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4125       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4126       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4127       Acsr = (CsrMatrix*)Acusp->mat->mat;
4128       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4129       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4130       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4131       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4132       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4133       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4134       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4135       auto pmid = Ccusp->cooPerm->begin();
4136       thrust::advance(pmid,Acsr->num_entries);
4137       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4138       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4139                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4140       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4141                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4142       thrust::for_each(zibait,zieait,VecCUDAEquals());
4143       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4144                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4145       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4146                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4147       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4148       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4149       if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
4150         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4151         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4152         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4153         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4154         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4155         auto vT = CcsrT->values->begin();
4156         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4157         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4158         Ccusp->transupdated = PETSC_TRUE;
4159       }
4160       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4161       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4162     }
4163   }
4164   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4165   (*C)->assembled     = PETSC_TRUE;
4166   (*C)->was_assembled = PETSC_FALSE;
4167   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4168   PetscFunctionReturn(0);
4169 }
4170 
/* Gather a subset of the matrix values array into v[]: v[i] = a[idx[i]] for i < n,
   or the first n entries of the values array when idx is NULL. The output buffer v
   may live in either host or device memory (detected with isCudaMem()); the gather
   runs on the GPU and the result is copied out in the appropriate direction.

   Input Parameters:
.  A   - the SEQAIJCUSPARSE matrix (values are accessed read-only on the device)
.  n   - number of entries to copy
.  idx - indices into the aij values array, or NULL for the leading n entries

   Output Parameter:
.  v   - host or device buffer of length n receiving the gathered values */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the index array so the gather can run on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: gather into a device scratch buffer, copy back below */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* zip (a[idx[i]], dv[i]) pairs and assign dv[i] = a[idx[i]] with VecCUDAEquals */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: contiguous copy of the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the result moved device->host; log it in that direction
     (was incorrectly logged with PetscLogCpuToGpu) */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4210