xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 764761ab5fe8cb07697d3041208cba61aa615fcc)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Names parsed by PetscOptionsEnum(); the trailing three entries are the enum type name,
   the option prefix, and the required null terminator */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Positions in these arrays track the cuSPARSE enum values above; note CSR_ALG1 (=4)
     precedes COO_ALG4 (=5) in MatCUSPARSESpMMAlgorithms[] for that reason */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
/* Forward declarations: symbolic factorization front ends implemented in this file */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves and matrix-vector/matrix operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destruction helpers.  NOTE: MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded (this is
   a C++ translation unit): one overload destroys a triangular-factor struct, the other a
   mult struct of the given storage format. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Host <-> device data movement */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91 
/* MatCUSPARSESetStream - records a user-provided CUDA stream in the matrix's GPU data
   and attaches it to the matrix's cuSPARSE handle, so subsequent cuSPARSE calls made
   through this matrix are issued on that stream. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
103 
/* MatCUSPARSESetHandle - installs a user-provided cuSPARSE handle on the matrix.
   An existing, different handle is destroyed first; the (possibly unchanged) handle is
   then switched to device pointer mode, i.e. scalar arguments to cuSPARSE calls are
   expected to live in device memory. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we currently own before adopting the new one */
    if (cusp->handle) {cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);}
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
120 
/* MatCUSPARSEClearHandle - forgets the matrix's cuSPARSE handle without destroying it
   (presumably the handle is owned by whoever supplied it via MatCUSPARSESetHandle() —
   confirm with callers).  A no-op for matrices that are not MATSEQAIJCUSPARSE or have
   no GPU data attached. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse || !cusp) PetscFunctionReturn(0);
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
133 
/* Reports the solver package name for factor matrices created here; composed on the
   factor matrix as "MatFactorGetSolverType_C" in MatGetFactor_seqaijcusparse_cusparse().
   A is unused but required by the composed-function signature. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
140 
141 /*MC
142   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
143   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
145   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
146   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
147   algorithms are not recommended. This class does NOT support direct solver operations.
148 
149   Level: beginner
150 
151 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
152 M*/
153 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates the factor matrix B used for
   LU/ILU/ILUDT or Cholesky/ICC factorization of a SeqAIJCUSPARSE matrix with the
   CUSPARSE solver package.

   The symbolic phase is delegated to the CPU SeqAIJ routines (see the
   Mat*FactorSymbolic_SeqAIJCUSPARSE wrappers below); the numeric phase later moves the
   triangular factors to the GPU.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  /* The factor is built n x n from the row size alone, so fail early on non-square input */
  if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Matrix must be square for factorization, has %D rows and %D columns",A->rmap->n,A->cmap->n);
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE; /* the factorization honors the ordering ISes passed to the symbolic routines */
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* the factor does its own data allocation during the symbolic phase */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
179 
/* Type-specific implementation of MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE.
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL currently both set the single stored format,
   so the two cases share one switch arm. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
197 
198 /*@
199    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
200    operation. Only the MatMult operation can use different GPU storage formats
201    for MPIAIJCUSPARSE matrices.
202    Not Collective
203 
204    Input Parameters:
205 +  A - Matrix of type SEQAIJCUSPARSE
206 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
207 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
208 
209    Output Parameter:
210 
211    Level: intermediate
212 
213 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
214 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if composed; silently a no-op for other matrix types */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
224 
225 /*@
226    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
227 
228    Collective on mat
229 
230    Input Parameters:
231 +  A - Matrix of type SEQAIJCUSPARSE
232 -  transgen - the boolean flag
233 
234    Level: intermediate
235 
236 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
237 @*/
/* MatSeqAIJCUSPARSESetGenerateTranspose - records whether an explicit GPU transpose
   should be built for MatMultTranspose().  A no-op for non-MATSEQAIJCUSPARSE matrices;
   an error for factored matrices. */
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (!iscusp) PetscFunctionReturn(0);
  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    cusp->transgen = transgen;
    /* when turning the feature off, drop any cached transpose so that a later
       transgen=PETSC_TRUE starts from a clean state */
    if (!transgen) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
257 
/* Processes the -mat_cusparse_* runtime options for a MATSEQAIJCUSPARSE matrix.
   Options are only consulted for non-factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* format for all operations; processed after the SpMV-only option so it takes precedence when both are given */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency guard as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency guard; cuSPARSE starts this enum at 1, hence the "INVALID" placeholder at position 0 */
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
301 
/* ILU symbolic factorization: runs on the CPU via the plain SeqAIJ implementation.
   GPU factor data from any previous factorization is discarded first, and the numeric
   phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
313 
/* LU symbolic factorization: runs on the CPU via the plain SeqAIJ implementation.
   GPU factor data from any previous factorization is discarded first, and the numeric
   phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
325 
/* ICC symbolic factorization: runs on the CPU via the plain SeqAIJ implementation.
   GPU factor data from any previous factorization is discarded first, and the numeric
   phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
337 
/* Cholesky symbolic factorization: runs on the CPU via the plain SeqAIJ implementation.
   GPU factor data from any previous factorization is discarded first, and the numeric
   phase is redirected to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
349 
/* Builds (or refreshes) the unit-lower-triangular factor L on the GPU from the host ILU
   factor stored in A (a factored SeqAIJ matrix).  On the first call the full CSR
   structure is assembled in pinned host memory, uploaded, and the cuSPARSE
   triangular-solve analysis is run; on later calls only the numerical values are copied
   up.  Host arrays a->i/a->j/a->a are assumed to hold the strictly-lower part of the
   factor rows (row 0 empty) — this matches the loop below, which prepends a unit
   diagonal entry per row. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* = n diagonal ones + the strictly-lower entries of rows 1..n-1 */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust assign() uploads are fast */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;       /* row 0 holds only the unit diagonal */
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* the assign() calls copy the pinned host arrays to the device */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit work buffer for the triangular-solve analysis */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept (as AA_h) for value-only refreshes below; the index arrays are not needed again */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        /* same traversal as above, but only the values; the sparsity pattern is unchanged */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
494 
/* Builds (or refreshes) the upper triangular factor U on the GPU from the host ILU
   factor stored in A.  Mirrors MatSeqAIJCUSPARSEBuildILULowerTriMatrix(): first call
   assembles the CSR structure in pinned host memory, uploads it, and runs the cuSPARSE
   triangular-solve analysis; later calls refresh values only.  The host factor stores U
   via a->diag: row i of U occupies aj/aa[adiag[i+1]+1 .. adiag[i]-1] with the (inverted)
   diagonal at adiag[i], so rows are walked from i = n-1 down to 0 — presumably the
   standard PETSc factored-AIJ layout (confirm against aijfact.c). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers so the thrust assign() uploads are fast */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        /* walk rows from last to first, filling the CSR arrays back-to-front */
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];   /* host factor stores the diagonal; U gets its reciprocal */
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* the assign() calls copy the pinned host arrays to the device */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE needs an explicit work buffer for the triangular-solve analysis */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AAUp is kept (as AA_h) for value-only refreshes below; the index arrays are not needed again */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {
        /* update values only; the sparsity pattern is unchanged */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
638 
/* Pushes the ILU lower/upper triangular factors of A to the GPU and, when the
   row/column orderings are non-trivial, caches them as device index arrays so
   the GPU MatSolve can permute the right-hand side and solution in place. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  IS                           colperm  = aij->icol;
  PetscBool                    ridentity,cidentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  /* stage both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device unless it is the identity */
  ierr = ISIdentity(rowperm,&ridentity);CHKERRQ(ierr);
  if (!ridentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowperm,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx,ridx+nrows);
    ierr = ISRestoreIndices(rowperm,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* likewise for the column permutation */
  ierr = ISIdentity(colperm,&cidentity);CHKERRQ(ierr);
  if (!cidentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colperm,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx,cidx+nrows);
    ierr = ISRestoreIndices(colperm,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
682 
/*
   Builds (or refreshes) the GPU-resident triangular factors used by the ICC
   solve.  Only the upper triangle is stored: the "lower" factor reuses the
   same CSR structure and is solved with CUSPARSE_OPERATION_TRANSPOSE.  On the
   first call the full structure and values are assembled in pinned host
   buffers and copied to the device; on later calls only the numerical values
   are recomputed and re-uploaded.  Does nothing unless the host copy is newer
   than the device copy (offloadmask UNALLOCATED or CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;   /* pinned host CSR row offsets / column indices (first build only) */
  PetscScalar                       *AAUp;          /* pinned host values of the upper factor */
  PetscScalar                       *AALo;          /* pinned host values of the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;  /* NOTE(review): A->data is cast to both Mat_SeqAIJ and Mat_SeqSBAIJ; presumably the ICC factor stores its i/j/a arrays in SBAIJ layout -- confirm */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the device copy is stale */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first build: assemble the structure as well as the values */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          /* v[nz] is the diagonal entry of row i; both factors store its reciprocal */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* copy the off-diagonal column indices/values, then negate the upper
               values and scale the lower values by the reciprocal diagonal */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* fill mode is UPPER on purpose: the lower factor stores the upper
           structure and is applied with CUSPARSE_OPERATION_TRANSPOSE below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* structure is shared with the upper factor; only the values differ */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist on the GPU: recompute values only and re-upload */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
898 
/* Pushes the ICC triangular factors of A to the GPU and, when the fill-reducing
   permutation is non-trivial, caches it and its inverse as device index arrays
   for use by the GPU MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    identity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* off-diagonal entries count once per factor, the diagonal once in total */
  factors->nnz = (aij->nz-nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the permutation and its inverse on the device unless trivial */
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invperm;
    const PetscInt *pidx,*ipidx;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&ipidx);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&pidx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(pidx,pidx+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(ipidx,ipidx+nrows);
    ierr = ISRestoreIndices(invperm,&ipidx);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&pidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
936 
/* Numeric LU factorization: factor on the host, select the GPU solve kernels
   according to the orderings, then stage the triangular factors on the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bdata = (Mat_SeqAIJ*)B->data;
  PetscBool      ridentity,cidentity,natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  /* the numeric factorization itself runs on the host */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used: identity row and
     column orderings admit the faster natural-ordering solve path */
  ierr = ISIdentity(bdata->row,&ridentity);CHKERRQ(ierr);
  ierr = ISIdentity(bdata->col,&cidentity);CHKERRQ(ierr);
  natural = (PetscBool)(ridentity && cidentity);
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
967 
/* Numeric Cholesky (ICC) factorization: factor on the host, select the GPU
   solve kernels according to the ordering, then stage the factors on the GPU. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bdata = (Mat_SeqAIJ*)B->data;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  /* host-side numeric factorization */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used: an identity
     permutation enables the faster natural-ordering solves */
  ierr = ISIdentity(bdata->row,&natural);CHKERRQ(ierr);
  if (natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
997 
/*
   Builds the transposes (CSC form) of the lower and upper triangular factors
   on the GPU and runs the cusparse solve analysis on them, so MatSolveTranspose
   does not have to transpose on the fly.  Results are stored in
   loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr.

   Fix: the MAT_CUSPARSEGenerateTranspose event was previously "closed" with a
   second PetscLogEventBegin (instead of PetscLogEventEnd) after each csr2csc,
   leaving the event permanently open and corrupting the logging statistics.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor:
     type/base/diag carry over, the fill mode flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is materialized, so the solve itself is non-transpose */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2csc requires an explicit workspace buffer */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: must end the event begun above */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: must end the event begun above */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1199 
/* Builds an explicit on-GPU transpose of the matrix (stored in cusparsestruct->matTranspose) so
   that MatMultTranspose() can later be executed as an ordinary SpMV on the transposed data.
   Returns immediately unless transpose generation was requested (transgen), no transpose exists
   yet, and the matrix has a nonzero number of rows and columns. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix descriptor for the transpose, inheriting the index base of the original */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants for SpMV, hence POINTER_MODE_DEVICE below */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    /* CSR: the CSC of A has the same layout as the CSR of A^T, so a csr2csc conversion
       produces the transpose directly */
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* upload the full (uncompressed) host row offsets; assign() converts PetscInt to the
       32-bit device array element type */
    if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA 11 csr2cscEx2 requires an explicit, user-allocated work buffer; query its size first.
       The buffer is kept in cusparsestruct so later value-only transpose updates can reuse it. */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* the generic SpMV API needs its own matrix descriptor wrapping the same device arrays */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    /* HYB/ELL has no direct transpose routine: round-trip through CSR
       (hyb2csr -> csr2csc -> csr2hyb), using two temporary CSR matrices */
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT holds the transpose yet is sized num_rows = A->rmap->n,
       num_cols = A->cmap->n (same as temp); this only matches A^T when A is square --
       confirm behavior for rectangular matrices in the HYB/ELL path */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1358 
1359 /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solves A^T x = b for a matrix factored with row/column permutations: b is reordered with the
   row permutation, the transposed upper then lower triangular factors are applied, and the
   result is reordered with the column permutation. The transposed factors are generated lazily
   on the first transpose solve. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation, storing the result in x (used as scratch here) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve with the transposed upper factor: xarray -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve with the transposed lower factor: tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays; 2*nnz - n flops for the two triangular solves */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1445 
/* Solves A^T x = b for a factorization done in the natural (unpermuted) ordering. The
   transposed triangular factors are created and analyzed lazily on the first call; the solve
   then applies the transposed upper factor followed by the transposed lower factor, staging
   the intermediate vector in the factors' work array. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs      = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *scratch = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Build and analyze the transposed factors on the fly if this is the first transpose solve */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* Map the PETSc vectors onto raw device arrays */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Transposed upper solve: b -> scratch */
  stat = cusparse_solve(fs->handle, upT->solveOp,
                        upT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upT->descr,
                        upT->csrMat->values->data().get(),
                        upT->csrMat->row_offsets->data().get(),
                        upT->csrMat->column_indices->data().get(),
                        upT->solveInfo,
                        b, scratch->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upT->solvePolicy, upT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Transposed lower solve: scratch -> x */
  stat = cusparse_solve(fs->handle, loT->solveOp,
                        loT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loT->descr,
                        loT->csrMat->values->data().get(),
                        loT->csrMat->row_offsets->data().get(),
                        loT->csrMat->column_indices->data().get(),
                        loT->solveInfo,
                        scratch->data().get(), x
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loT->solvePolicy, loT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Release the vector arrays; log 2*nnz - n flops for the two triangular solves */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1513 
/* Solves A x = b for a matrix factored with row/column permutations: b is reordered with the
   row permutation into the work vector, the lower then upper triangular factors are applied,
   and the result is reordered with the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: xarray -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder the work vector with the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  /* restore the vector arrays; 2*nnz - n flops for the two triangular solves */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1588 
/* Solves A x = b for a factorization done in the natural (unpermuted) ordering: applies the
   lower factor and then the upper factor, staging the intermediate result in the factors'
   work array. No permutation copies are needed in this variant. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *fs      = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *scratch = (THRUSTARRAY*)fs->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Map the PETSc vectors onto raw device arrays */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Lower solve: b -> scratch */
  stat = cusparse_solve(fs->handle, lo->solveOp,
                        lo->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lo->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lo->descr,
                        lo->csrMat->values->data().get(),
                        lo->csrMat->row_offsets->data().get(),
                        lo->csrMat->column_indices->data().get(),
                        lo->solveInfo,
                        b, scratch->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lo->solvePolicy, lo->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Upper solve: scratch -> x */
  stat = cusparse_solve(fs->handle, up->solveOp,
                        up->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        up->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, up->descr,
                        up->csrMat->values->data().get(),
                        up->csrMat->row_offsets->data().get(),
                        up->csrMat->column_indices->data().get(),
                        up->solveInfo,
                        scratch->data().get(), x
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,up->solvePolicy, up->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Release the vector arrays; log 2*nnz - n flops for the two triangular solves */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1648 
/* Copies the nonzero values from the device CSR matrix back into the host CSR array when the
   GPU holds the only up-to-date copy. Only the values are transferred (the sparsity pattern is
   untouched); afterwards both copies are marked current (PETSC_OFFLOAD_BOTH). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* Nothing to do unless the device copy is strictly newer than the host copy */
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *csr = (CsrMatrix*)gpu->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
1669 
/* MatSeqAIJGetArray() implementation for the CUSPARSE type: hands out the host pointer to the
   nonzero values after syncing them down from the GPU. The offload mask is then set to CPU —
   presumably because the caller may modify the returned array, invalidating the device copy. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* make the host values current before exposing them */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = aij->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1681 
1682 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1683 {
1684   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1685   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1686   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1687   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1688   PetscErrorCode               ierr;
1689   cusparseStatus_t             stat;
1690   PetscBool                    both = PETSC_TRUE;
1691   cudaError_t                  err;
1692 
1693   PetscFunctionBegin;
1694   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1695   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1696     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1697       /* Copy values only */
1698       CsrMatrix *matrix,*matrixT;
1699       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1700 
1701       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1702       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1703       matrix->values->assign(a->a, a->a+a->nz);
1704       err  = WaitForCUDA();CHKERRCUDA(err);
1705       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1706       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1707 
1708       /* Update matT when it was built before */
1709       if (cusparsestruct->matTranspose) {
1710         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1711         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1712         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1713         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1714                             A->cmap->n, matrix->num_entries,
1715                             matrix->values->data().get(),
1716                             cusparsestruct->rowoffsets_gpu->data().get(),
1717                             matrix->column_indices->data().get(),
1718                             matrixT->values->data().get(),
1719                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1720                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1721                             CUSPARSE_ACTION_NUMERIC,indexBase,
1722                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1723                           #else
1724                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1725                             CUSPARSE_ACTION_NUMERIC, indexBase
1726                           #endif
1727 );CHKERRCUSPARSE(stat);
1728         err  = WaitForCUDA();CHKERRCUDA(err);
1729         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1730       }
1731     } else {
1732       PetscInt nnz;
1733       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1734       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1735       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1736       delete cusparsestruct->workVector;
1737       delete cusparsestruct->rowoffsets_gpu;
1738       try {
1739         if (a->compressedrow.use) {
1740           m    = a->compressedrow.nrows;
1741           ii   = a->compressedrow.i;
1742           ridx = a->compressedrow.rindex;
1743         } else {
1744           m    = A->rmap->n;
1745           ii   = a->i;
1746           ridx = NULL;
1747         }
1748         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1749         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1750         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1751         else nnz = a->nz;
1752 
1753         /* create cusparse matrix */
1754         cusparsestruct->nrows = m;
1755         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1756         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1757         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1758         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1759 
1760         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1761         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1762         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1763         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1764         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1765         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1766         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1767 
1768         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1769         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1770           /* set the matrix */
1771           CsrMatrix *mat= new CsrMatrix;
1772           mat->num_rows = m;
1773           mat->num_cols = A->cmap->n;
1774           mat->num_entries = nnz;
1775           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1776           mat->row_offsets->assign(ii, ii + m+1);
1777 
1778           mat->column_indices = new THRUSTINTARRAY32(nnz);
1779           mat->column_indices->assign(a->j, a->j+nnz);
1780 
1781           mat->values = new THRUSTARRAY(nnz);
1782           if (a->a) mat->values->assign(a->a, a->a+nnz);
1783 
1784           /* assign the pointer */
1785           matstruct->mat = mat;
1786          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1787           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1788             stat = cusparseCreateCsr(&matstruct->matDescr,
1789                                     mat->num_rows, mat->num_cols, mat->num_entries,
1790                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1791                                     mat->values->data().get(),
1792                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1793                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1794           }
1795          #endif
1796         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1797          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1798           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1799          #else
1800           CsrMatrix *mat= new CsrMatrix;
1801           mat->num_rows = m;
1802           mat->num_cols = A->cmap->n;
1803           mat->num_entries = nnz;
1804           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1805           mat->row_offsets->assign(ii, ii + m+1);
1806 
1807           mat->column_indices = new THRUSTINTARRAY32(nnz);
1808           mat->column_indices->assign(a->j, a->j+nnz);
1809 
1810           mat->values = new THRUSTARRAY(nnz);
1811           if (a->a) mat->values->assign(a->a, a->a+nnz);
1812 
1813           cusparseHybMat_t hybMat;
1814           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1815           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1816             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1817           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1818               matstruct->descr, mat->values->data().get(),
1819               mat->row_offsets->data().get(),
1820               mat->column_indices->data().get(),
1821               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1822           /* assign the pointer */
1823           matstruct->mat = hybMat;
1824 
1825           if (mat) {
1826             if (mat->values) delete (THRUSTARRAY*)mat->values;
1827             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1828             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1829             delete (CsrMatrix*)mat;
1830           }
1831          #endif
1832         }
1833 
1834         /* assign the compressed row indices */
1835         if (a->compressedrow.use) {
1836           cusparsestruct->workVector = new THRUSTARRAY(m);
1837           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1838           matstruct->cprowIndices->assign(ridx,ridx+m);
1839           tmp = m;
1840         } else {
1841           cusparsestruct->workVector = NULL;
1842           matstruct->cprowIndices    = NULL;
1843           tmp = 0;
1844         }
1845         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1846 
1847         /* assign the pointer */
1848         cusparsestruct->mat = matstruct;
1849       } catch(char *ex) {
1850         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1851       }
1852       err  = WaitForCUDA();CHKERRCUDA(err);
1853       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1854       cusparsestruct->nonzerostate = A->nonzerostate;
1855     }
1856     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1857   }
1858   PetscFunctionReturn(0);
1859 }
1860 
/* Thrust functor for use with a zip iterator over (source, destination):
   accumulates the first tuple element into the second, i.e. dest += src. */
struct VecCUDAPlusEquals
{
  template <typename TuplePair>
  __host__ __device__
  void operator()(TuplePair zipped)
  {
    thrust::get<1>(zipped) += thrust::get<0>(zipped);
  }
};
1870 
/* Thrust functor for use with a zip iterator over (source, destination):
   copies the first tuple element into the second, i.e. dest = src. */
struct VecCUDAEquals
{
  template <typename TuplePair>
  __host__ __device__
  void operator()(TuplePair zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
1880 
/* Thrust functor for use with a zip iterator over (destination, source):
   copies the second tuple element into the first (the reverse of VecCUDAEquals). */
struct VecCUDAEqualsReverse
{
  template <typename TuplePair>
  __host__ __device__
  void operator()(TuplePair zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
1890 
/* Scratch data attached to Mat_Product for the cusparse-based mat-mat product kernels
   below; allocated with PetscNew (zeroed) in the symbolic phases and released by
   MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool             cisdense; /* caller's C was MATSEQDENSE (host); numeric phase converts the result back */
  PetscScalar           *Bt;      /* device buffer holding explicit B^T for ABt/RARt (pre CUDA-11 csrmm has no transpose-B); may be NULL */
  Mat                   X;        /* intermediate dense matrix used by the PtAP/RARt paths */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count logged in the numeric phase */
  CsrMatrix             *Bcsr;    /* substitute CSR for B when B is in compressed row storage; may be NULL */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr; /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr; /* dense descriptor for C (or for X in the PtAP/RARt paths) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* size of mmBuffer in bytes */
  void                  *mmBuffer;    /* workspace for cusparseSpMM / SpGEMM compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1910 
/* Destructor installed as C->product->destroy: frees every GPU resource held by
   a MatMatCusparse scratch structure and then the structure itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a harmless no-op */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* destroy cusparse descriptors first, then the raw device workspaces */
  if (mm->matSpBDescr) { stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matBDescr)   { stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matCDescr)   { stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat); }
  if (mm->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat); }
  if (mm->mmBuffer)    { cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr); }
  if (mm->mmBuffer2)   { cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1935 
1936 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1937 
/* Numeric phase of a sparse (SeqAIJCUSPARSE) times dense product, computed on the GPU.
   For AB/AtB/ABt the result goes directly into C; for PtAP/RARt the sparse-dense part
   is written into the intermediate dense mmdata->X and then combined with B via a
   dense-dense multiply.  Uses cusparseSpMM on CUDA >= 11 (caching descriptors and the
   workspace buffer in mmdata), and cusparseXcsrmm otherwise (with an explicit B^T in
   mmdata->Bt since csrmm cannot transpose B). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A), the mult struct to use (plain or transpose), and the result sizes m x n */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either use cusparse's transpose operation, or an explicitly stored A^T */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X, not into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the cached workspace */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr; /* NOTE: shadows the outer cudaError_t cerr inside this scope */

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* form B^T explicitly in mmdata->Bt with a cublas geam transpose-copy */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* for RARt/PtAP finish with a dense-dense multiply of B (or B^T) against X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the temporary conversions done above so the caller's matrices keep their types */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2111 
/* Symbolic phase of a sparse (SeqAIJCUSPARSE) times dense product: validates the
   operands, sets the sizes and type of C, and allocates the MatMatCusparse scratch
   data (including the B^T buffer on pre-11 CUDA and the intermediate X for PtAP/RARt).
   Installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n depend on which operands get transposed */
  if (product->type == MATPRODUCT_AB) {
    m = A->rmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    m = A->cmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    m = A->rmap->n; n = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    m = B->cmap->n; n = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    m = B->rmap->n; n = B->rmap->n;
  } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2185 
/* Numeric phase of a sparse-sparse product C = op(A) op(B) with all matrices
   SeqAIJCUSPARSE.  The sparsity pattern of C was fixed by the symbolic phase;
   here only the values are (re)computed, via cusparseSpGEMM_compute/copy on
   CUDA >= 11 and cusparse_csr_spgemm otherwise.  When mmdata->reusesym is set,
   the values were already produced during the symbolic phase and only the
   assembly bookkeeping at 'finalize' is performed. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  /* empty product: nothing to compute, just finish assembly */
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let AtB/ABt be computed as a plain AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute the values, then copy them into C's descriptor (pattern is unchanged) */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2300 
2301 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2302 {
2303   Mat_Product                  *product = C->product;
2304   Mat                          A,B;
2305   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2306   Mat_SeqAIJ                   *a,*b,*c;
2307   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2308   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2309   PetscInt                     i,j,m,n,k;
2310   PetscBool                    flg;
2311   PetscErrorCode               ierr;
2312   cusparseStatus_t             stat;
2313   cudaError_t                  cerr;
2314   MatProductType               ptype;
2315   MatMatCusparse               *mmdata;
2316   PetscLogDouble               flops;
2317   PetscBool                    biscompressed,ciscompressed;
2318 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2319   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2320   size_t                       bufSize2;
2321   cusparseSpMatDescr_t         BmatSpDescr;
2322 #else
2323   int                          cnz;
2324 #endif
2325 
2326   PetscFunctionBegin;
2327   MatCheckProduct(C,1);
2328   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2329   A    = product->A;
2330   B    = product->B;
2331   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2332   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2333   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2334   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2335   a = (Mat_SeqAIJ*)A->data;
2336   b = (Mat_SeqAIJ*)B->data;
2337   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2338   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2339   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2340   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2341 
2342   /* product data */
2343   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2344   C->product->data    = mmdata;
2345   C->product->destroy = MatDestroy_MatMatCusparse;
2346 
2347   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2348   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2349   ptype = product->type;
2350   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2351   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2352   biscompressed = PETSC_FALSE;
2353   ciscompressed = PETSC_FALSE;
2354   switch (ptype) {
2355   case MATPRODUCT_AB:
2356     m = A->rmap->n;
2357     n = B->cmap->n;
2358     k = A->cmap->n;
2359     Amat = Acusp->mat;
2360     Bmat = Bcusp->mat;
2361     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2362     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2363     break;
2364   case MATPRODUCT_AtB:
2365     m = A->cmap->n;
2366     n = B->cmap->n;
2367     k = A->rmap->n;
2368     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2369     Amat = Acusp->matTranspose;
2370     Bmat = Bcusp->mat;
2371     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2372     break;
2373   case MATPRODUCT_ABt:
2374     m = A->rmap->n;
2375     n = B->rmap->n;
2376     k = A->cmap->n;
2377     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2378     Amat = Acusp->mat;
2379     Bmat = Bcusp->matTranspose;
2380     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2381     break;
2382   default:
2383     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2384   }
2385 
2386   /* create cusparse matrix */
2387   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2388   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2389   c     = (Mat_SeqAIJ*)C->data;
2390   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2391   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2392   Ccsr  = new CsrMatrix;
2393 
2394   c->compressedrow.use = ciscompressed;
2395   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2396     c->compressedrow.nrows = a->compressedrow.nrows;
2397     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2398     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2399     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2400     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2401     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2402   } else {
2403     c->compressedrow.nrows  = 0;
2404     c->compressedrow.i      = NULL;
2405     c->compressedrow.rindex = NULL;
2406     Ccusp->workVector       = NULL;
2407     Cmat->cprowIndices      = NULL;
2408   }
2409   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2410   Ccusp->mat      = Cmat;
2411   Ccusp->mat->mat = Ccsr;
2412   Ccsr->num_rows    = Ccusp->nrows;
2413   Ccsr->num_cols    = n;
2414   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2415   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2416   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2417   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2418   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2419   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2420   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2421   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2422   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2423   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2424   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2425     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2426     c->nz = 0;
2427     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2428     Ccsr->values = new THRUSTARRAY(c->nz);
2429     goto finalizesym;
2430   }
2431 
2432   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2433   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2434   Acsr = (CsrMatrix*)Amat->mat;
2435   if (!biscompressed) {
2436     Bcsr = (CsrMatrix*)Bmat->mat;
2437 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2438     BmatSpDescr = Bmat->matDescr;
2439 #endif
2440   } else { /* we need to use row offsets for the full matrix */
2441     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2442     Bcsr = new CsrMatrix;
2443     Bcsr->num_rows       = B->rmap->n;
2444     Bcsr->num_cols       = cBcsr->num_cols;
2445     Bcsr->num_entries    = cBcsr->num_entries;
2446     Bcsr->column_indices = cBcsr->column_indices;
2447     Bcsr->values         = cBcsr->values;
2448     if (!Bcusp->rowoffsets_gpu) {
2449       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2450       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2451       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2452     }
2453     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2454     mmdata->Bcsr = Bcsr;
2455 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2456     if (Bcsr->num_rows && Bcsr->num_cols) {
2457       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2458                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2459                                Bcsr->values->data().get(),
2460                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2461                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2462     }
2463     BmatSpDescr = mmdata->matSpBDescr;
2464 #endif
2465   }
2466   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2467   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2468   /* precompute flops count */
2469   if (ptype == MATPRODUCT_AB) {
2470     for (i=0, flops = 0; i<A->rmap->n; i++) {
2471       const PetscInt st = a->i[i];
2472       const PetscInt en = a->i[i+1];
2473       for (j=st; j<en; j++) {
2474         const PetscInt brow = a->j[j];
2475         flops += 2.*(b->i[brow+1] - b->i[brow]);
2476       }
2477     }
2478   } else if (ptype == MATPRODUCT_AtB) {
2479     for (i=0, flops = 0; i<A->rmap->n; i++) {
2480       const PetscInt anzi = a->i[i+1] - a->i[i];
2481       const PetscInt bnzi = b->i[i+1] - b->i[i];
2482       flops += (2.*anzi)*bnzi;
2483     }
2484   } else { /* TODO */
2485     flops = 0.;
2486   }
2487 
2488   mmdata->flops = flops;
2489   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2490 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2491   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2492   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2493                            NULL, NULL, NULL,
2494                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2495                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2496   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2497   /* ask bufferSize bytes for external memory */
2498   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2499                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2500                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2501                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2502   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2503   /* inspect the matrices A and B to understand the memory requirement for the next step */
2504   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2505                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2506                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2507                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2508   /* ask bufferSize again bytes for external memory */
2509   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2510                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2511                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2512                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2513   /* The CUSPARSE documentation is not clear, nor the API
2514      We need both buffers to perform the operations properly!
2515      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2516      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2517      is stored in the descriptor! What a messy API... */
2518   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2519   /* compute the intermediate product of A * B */
2520   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2521                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2522                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2523                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2524   /* get matrix C non-zero entries C_nnz1 */
2525   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2526   c->nz = (PetscInt) C_nnz1;
2527   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2528   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2529   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2530   Ccsr->values = new THRUSTARRAY(c->nz);
2531   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2532   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2533                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2534   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2535                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2536                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2537 #else
2538   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2539   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2540                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2541                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2542                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2543                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2544   c->nz = cnz;
2545   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2546   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2547   Ccsr->values = new THRUSTARRAY(c->nz);
2548   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2549 
2550   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2551   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2552      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2553      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2554   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2556                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2557                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2558                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2559 #endif
2560   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2561   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2562   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2563 finalizesym:
2564   c->singlemalloc = PETSC_FALSE;
2565   c->free_a       = PETSC_TRUE;
2566   c->free_ij      = PETSC_TRUE;
2567   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2568   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2569   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2570     PetscInt *d_i = c->i;
2571     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2572     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2573     ii   = *Ccsr->row_offsets;
2574     jj   = *Ccsr->column_indices;
2575     if (ciscompressed) d_i = c->compressedrow.i;
2576     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2577     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2578   } else {
2579     PetscInt *d_i = c->i;
2580     if (ciscompressed) d_i = c->compressedrow.i;
2581     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2582     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2583   }
2584   if (ciscompressed) { /* need to expand host row offsets */
2585     PetscInt r = 0;
2586     c->i[0] = 0;
2587     for (k = 0; k < c->compressedrow.nrows; k++) {
2588       const PetscInt next = c->compressedrow.rindex[k];
2589       const PetscInt old = c->compressedrow.i[k];
2590       for (; r < next; r++) c->i[r+1] = old;
2591     }
2592     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2593   }
2594   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2595   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2596   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2597   c->maxnz = c->nz;
2598   c->nonzerorowcnt = 0;
2599   c->rmax = 0;
2600   for (k = 0; k < m; k++) {
2601     const PetscInt nn = c->i[k+1] - c->i[k];
2602     c->ilen[k] = c->imax[k] = nn;
2603     c->nonzerorowcnt += (PetscInt)!!nn;
2604     c->rmax = PetscMax(c->rmax,nn);
2605   }
2606   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2607   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2608   Ccsr->num_entries = c->nz;
2609 
2610   C->nonzerostate++;
2611   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2612   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2613   Ccusp->nonzerostate = C->nonzerostate;
2614   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2615   C->preallocated  = PETSC_TRUE;
2616   C->assembled     = PETSC_FALSE;
2617   C->was_assembled = PETSC_FALSE;
2618   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2619     mmdata->reusesym = PETSC_TRUE;
2620     C->offloadmask   = PETSC_OFFLOAD_GPU;
2621   }
2622   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2623   PetscFunctionReturn(0);
2624 }
2625 
2626 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2627 
/* Dispatch routine: selects the symbolic-product implementation for this matrix.
   B may be sparse (MATSEQAIJCUSPARSE) or dense (MATSEQDENSE); anything else falls
   back to the plain SeqAIJ machinery. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode status;
  PetscBool      Bdense = PETSC_FALSE,Bcusp = PETSC_FALSE,Ccusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  status = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense);CHKERRQ(status);
  /* B qualifies for the GPU sparse path only when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    status = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bcusp);CHKERRQ(status);
  }
  /* for ABC the third matrix participates and must itself be a GPU matrix not bound to the CPU */
  if (product->type == MATPRODUCT_ABC) {
    Ccusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      status = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ccusp);CHKERRQ(status);
    }
  }
  if (Bdense) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt || t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        status = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(status);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* remaining product types: leave productsymbolic unset, as before */
  } else if (Bcusp && Ccusp) {
    const MatProductType t = product->type;
    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt || t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for plain AIJ combinations */
    status = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(status);
  }
  PetscFunctionReturn(0);
}
2686 
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  /* yy = A*xx: no added vector (NULL), no transpose, no Hermitian */
  status = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(status);
  PetscFunctionReturn(0);
}
2695 
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  /* zz = A*xx + yy: plain (non-transposed) product plus an added vector */
  status = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(status);
  PetscFunctionReturn(0);
}
2704 
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  /* yy = A^H*xx: transpose and Hermitian flags both set, no added vector */
  status = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(status);
  PetscFunctionReturn(0);
}
2713 
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  /* zz = A^H*xx + yy: transpose and Hermitian flags both set */
  status = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(status);
  PetscFunctionReturn(0);
}
2722 
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  /* yy = A^T*xx: transpose only (not Hermitian), no added vector */
  status = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(status);
  PetscFunctionReturn(0);
}
2731 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   yy may be NULL (no vector added) and may alias zz; herm without trans is an error.
   This is the common kernel behind all the MatMult*_SeqAIJCUSPARSE entry points. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed; /* true when the mult struct stores only nonzero rows (compressed row format) */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y in y = op(A) x; set in the CSR branches below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* all rows empty: op(A) x is zero, so the result is just y (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* either A^H is requested or no explicit transpose is kept: let cusparse apply op to A itself */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use the explicitly stored transpose with a non-transpose operation (opA stays NON_TRANSPOSE) */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector: workVector[k] = x[cprowIndices[k]] */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* transposed operation: x has num_rows entries, y has num_cols entries */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache cuSpMV[]; guard against enum layout changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand: dense-vector descriptors and SpMV work buffer are cached per operation */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      /* pre-CUDA-11 legacy csrmv path */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) { /* transpose path computed zz = op(A)*xx (+ yy already when yy == zz via beta_one) */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    /* NOTE(review): this catch matches the file-wide convention for thrust errors; thrust typically
       throws std::exception subclasses, so char* may not catch them — confirm against the rest of the file */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* flop count: one multiply + one add per stored nonzero; without an added vector we save one add per nonzero row */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2915 
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode status;

  PetscFunctionBegin;
  /* zz = A^T*xx + yy: transpose only (not Hermitian) with an added vector */
  status = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(status);
  PetscFunctionReturn(0);
}
2924 
/* Finish assembly of a SeqAIJCUSPARSE matrix.  Runs the host-side SeqAIJ assembly,
   then, if a device-side split-CSR matrix exists (assembly happened on the GPU),
   marks the GPU copy as the authoritative one. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;
  PetscFunctionBegin;
  /* only an unfactored matrix carries a Mat_SeqAIJCUSPARSE in spptr; factored
     matrices store trifactors there instead, so do not dereference in that case */
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  /* flush assemblies and CPU-bound matrices keep their current offload state */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (d_mat) {
    /* values were set directly on the device; GPU copy is now the valid one */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}
2941 
2942 /* --------------------------------------------------------------------------------*/
2943 /*@
2944    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2945    (the default parallel PETSc format). This matrix will ultimately be pushed down
2946    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2947    assembly performance the user should preallocate the matrix storage by setting
2948    the parameter nz (or the array nnz).  By setting these parameters accurately,
2949    performance during matrix assembly can be increased by more than a factor of 50.
2950 
2951    Collective
2952 
2953    Input Parameters:
2954 +  comm - MPI communicator, set to PETSC_COMM_SELF
2955 .  m - number of rows
2956 .  n - number of columns
2957 .  nz - number of nonzeros per row (same for all rows)
2958 -  nnz - array containing the number of nonzeros in the various rows
2959          (possibly different for each row) or NULL
2960 
2961    Output Parameter:
2962 .  A - the matrix
2963 
2964    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
2965    MatXXXXSetPreallocation() paradigm instead of this routine directly.
2966    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2967 
2968    Notes:
2969    If nnz is given then nz is ignored
2970 
2971    The AIJ format (also called the Yale sparse matrix format or
2972    compressed row storage), is fully compatible with standard Fortran 77
2973    storage.  That is, the stored row and column indices can begin at
2974    either one (as in Fortran) or zero.  See the users' manual for details.
2975 
2976    Specify the preallocated storage with either nz or nnz (not both).
2977    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2978    allocation.  For large problems you MUST preallocate memory or you
2979    will get TERRIBLE performance, see the users' manual chapter on matrices.
2980 
2981    By default, this format uses inodes (identical nodes) when possible, to
2982    improve numerical efficiency of matrix-vector products and solves. We
2983    search for consecutive rows with the same nonzero structure, thereby
2984    reusing matrix information to achieve increased efficiency.
2985 
2986    Level: intermediate
2987 
2988 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2989 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local and global sizes coincide, hence (m,n,m,n) */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocate on the host (SeqAIJ) side; the GPU copy is built lazily on first use */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3001 
/* Destroy a SeqAIJCUSPARSE matrix: free the GPU-side structures (plain mult
   structures or triangular factors, depending on factortype), free the device
   split-CSR matrix if one was created, remove composed methods, then destroy
   the host SeqAIJ part. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device-matrix pointer BEFORE destroying spptr, which owns it */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the device-side struct back to the host so its interior device
       allocations can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i was separately cudaMalloc'ed for the compressed-row case */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* detach all type-specific composed methods before falling back to SeqAIJ destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3037 
3038 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3039 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a host SeqAIJ copy, then converting it in place to
   SeqAIJCUSPARSE so the new matrix gets the GPU function table and spptr. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3049 
/* Computes Y = Y + a*X on the GPU when both matrices are SeqAIJCUSPARSE.
   Dispatch:
     - SAME_NONZERO_PATTERN   -> single cublas axpy on the two value arrays
     - SUBSET_NONZERO_PATTERN -> cusparse csrgeam (sparse add) accumulated into Y
     - otherwise              -> CPU fallback MatAXPY_SeqAIJ
   Only MAT_CUSPARSE_CSR storage is supported on this path. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* mixed types (one of the matrices is not on this code path): use the CPU kernel */
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  cy   = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx   = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: if both CSR index arrays are
     identical on the device the patterns are actually the same */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0; /* coefficient of Y in the geam: Y = a*X + 1*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a and b live on the host for the geam call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace: query size, allocate, compute */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* output written in place into Y's value array (same sparsity descriptors) */
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode other code paths expect -- presumably DEVICE is
       the mode used elsewhere with this handle; confirm against the rest of the file */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    /* identical patterns: the value arrays align entry-for-entry, so a dense axpy suffices */
    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* different patterns: Y's structure must change, handled on the CPU */
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3148 
/* Scales every stored value of Y by a (Y = a*Y) with a single cublas scal over
   the device value array, then invalidates the cached diagonal. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *val;
  cudaError_t    cerr;
  cublasHandle_t handle;
  cublasStatus_t berr;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr = PetscBLASIntCast(aij->nz,&n);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&val);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* treat the nz values as a dense vector and scale in place */
  berr = cublasXscal(handle,n,&a,val,inc);CHKERRCUBLAS(berr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuFlops(n);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&val);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3172 
/* Zeros all stored values of A on both the device (CSR and transpose copies,
   if allocated) and the host, and updates the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE; /* set when device values were zeroed too */
  Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    /* keep the cached transpose consistent with the zeroed matrix */
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  /* zero the host value array directly (a->i[nrows] == total nonzeros) */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  /* both copies are zero -> BOTH; otherwise only the host copy is valid */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
3204 
/* Switches the matrix operation table between the CPU (SeqAIJ) and GPU
   (SeqAIJCUSPARSE) implementations.  flg == PETSC_TRUE binds to the CPU:
   data is pulled back from the device and composed GPU-only methods are
   detached; flg == PETSC_FALSE (re)installs the GPU kernels. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored matrices have their own solve path; nothing to rebind */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make the host copy current before handing control to the CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* detach GPU-only composed methods (NULL removes the composition) */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; disable them while on the GPU */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3255 
/* Converts a SeqAIJ matrix to SeqAIJCUSPARSE: allocates the GPU-side spptr
   (mult structures for unfactored matrices, trifactors otherwise), installs
   the CUSPARSE function table, and switches the default vector type to CUDA.
   Handles MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX, and (implicitly, by falling
   through both branches) MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created for this matrix should live on the GPU as well */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrices carry triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the GPU operation table and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3303 
/* Type constructor registered for MATSEQAIJCUSPARSE: builds a SeqAIJ matrix,
   converts it in place to the CUSPARSE type, then processes type-specific
   options.  PetscOptionsObject used below is declared by the
   PetscObjectOptionsBegin() macro. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3316 
3317 /*MC
3318    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3319 
3320    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3321    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3322    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3323 
3324    Options Database Keys:
3325 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3326 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3327 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3328 
3329   Level: beginner
3330 
3331 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3332 M*/
3333 
3334 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3335 
3336 
/* Registers the CUSPARSE solver package for every factorization type it
   supports (LU, Cholesky, ILU, ICC) on MATSEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  /* all four factor types share the same factory routine */
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3348 
/* Frees all device-side state owned by a Mat_SeqAIJCUSPARSE: the mult
   structures for A and A^T, thrust work arrays, COO permutation data, the
   cusparse handle, and (CUDA >= 11) the csr2csc conversion buffer. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* deleting a NULL pointer is a no-op, so no guards are needed here */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is legal, so the buffer need not be checked first */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3370 
/* Frees a CsrMatrix container and its thrust-managed device arrays, then
   zeros the caller's pointer.  Safe to call on a NULL pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = 0;
  PetscFunctionReturn(0);
}
3383 
/* Frees one triangular-factor structure: its cusparse matrix descriptor and
   analysis info, the factor's CSR storage, and the assorted work buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is pinned host memory, hence cudaFreeHost rather than cudaFree */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3403 
/* Frees a mult structure: the stored matrix (HYB or CSR depending on format),
   the cusparse descriptor, device-resident scalar constants, and (CUDA >= 11)
   the generic SpMV descriptors and buffers.
   Fix: the return code of CsrMatrix_Destroy() was previously discarded; it is
   now checked with CHKERRQ like every other call site in this file. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* error code was previously ignored */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are scalars kept in device memory */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per operation variant (see the cuSpMV array) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3447 
/* Releases the contents of a trifactors container (all four factor structs,
   permutation index arrays, and the work vector) without freeing the container
   or its cusparse handle, so it can be refilled by a new factorization. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* NULL the freed members so a later Reset/Destroy does not double-free */
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
3467 
/* Fully destroys a trifactors container: resets its contents, destroys the
   cusparse handle, and frees the container itself.
   Fix: the original used an assignment inside the if-condition
   (`if (handle = ...)`), which compilers flag with -Wparentheses and which
   obscures intent; the assignment and the test are now separate statements. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3484 
/* Strict-weak lexicographic ordering on (row,col) index pairs; used when
   sorting COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    /* order by row first; ties are broken by column */
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
3495 
/* Equality predicate on (row,col) index pairs: true iff both components match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3505 
/* Binary op returning 0 when the two indices coincide and 1 otherwise. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 == t2) return 0;
    return 1;
  }
};
3514 
/* Binary op computing the logical OR of two integers, yielding 0 or 1. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3523 
3524 #include <thrust/iterator/discard_iterator.h>
3525 PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
3526 {
3527   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3528   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3529   THRUSTARRAY                           *cooPerm_v = NULL;
3530   thrust::device_ptr<const PetscScalar> d_v;
3531   CsrMatrix                             *matrix;
3532   PetscErrorCode                        ierr;
3533   cudaError_t                           cerr;
3534   PetscInt                              n;
3535 
3536   PetscFunctionBegin;
3537   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
3538   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
3539   if (!cusp->cooPerm) {
3540     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3541     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
3542     PetscFunctionReturn(0);
3543   }
3544   matrix = (CsrMatrix*)cusp->mat->mat;
3545   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3546   if (!v) {
3547     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3548     goto finalize;
3549   }
3550   n = cusp->cooPerm->size();
3551   if (isCudaMem(v)) {
3552     d_v = thrust::device_pointer_cast(v);
3553   } else {
3554     cooPerm_v = new THRUSTARRAY(n);
3555     cooPerm_v->assign(v,v+n);
3556     d_v = cooPerm_v->data();
3557     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
3558   }
3559   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3560   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3561     if (cusp->cooPerm_a) {
3562       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
3563       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3564       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3565       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3566       delete cooPerm_w;
3567     } else {
3568       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3569                                                                 matrix->values->begin()));
3570       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3571                                                                 matrix->values->end()));
3572       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
3573     }
3574   } else {
3575     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
3576       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3577       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3578     } else {
3579       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
3580                                                                 matrix->values->begin()));
3581       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
3582                                                                 matrix->values->end()));
3583       thrust::for_each(zibit,zieit,VecCUDAEquals());
3584     }
3585   }
3586   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3587   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3588 finalize:
3589   delete cooPerm_v;
3590   A->offloadmask = PETSC_OFFLOAD_GPU;
3591   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3592   /* shorter version of MatAssemblyEnd_SeqAIJ */
3593   ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
3594   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
3595   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
3596   a->reallocs         = 0;
3597   A->info.mallocs    += 0;
3598   A->info.nz_unneeded = 0;
3599   A->assembled = A->was_assembled = PETSC_TRUE;
3600   A->num_ass++;
3601   PetscFunctionReturn(0);
3602 }
3603 
3604 #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR nonzero structure of A on the GPU
  from n COO entries (coo_i[],coo_j[]) given on the host.

  The (i,j) pairs are sorted on the device.  cusp->cooPerm records, for each slot of the
  sorted entry list, the position of that entry in the caller's original coo_i[]/coo_j[]
  arrays, so MatSetValuesCOO can later permute the value array the same way.  When the
  input contains repeated (i,j) pairs, cusp->cooPerm_a additionally maps each sorted input
  entry to its destination in the compressed (duplicate-free) nonzero array so repeated
  values can be combined with reduce_by_key; when all entries are unique, cooPerm_a is NULL.

  Only the pattern is built here; the value array a->a is zeroed and the numerical values
  are expected to arrive later through MatSetValuesCOO.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows that have at least one entry */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* discard permutation data from a previous preallocation of a different size */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n); /* device copies of the caller's COO indices */
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (i,j) pairs lexicographically, dragging the identity along to obtain the permutation */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i; /* keep the sorted i (and j in w) before unique() compacts d_i/d_j in place */
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag positions where the (i,j) key changes and prefix-sum the flags: afterwards
         cooPerm_a[k] holds the location in the compressed nonzero array that the k-th
         sorted input entry contributes to */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* per-row counts: ii[r] becomes the number of unique entries in rows 0..r (cumulative),
       i.e. exactly the CSR row offset of row r+1 */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* replace the host CSR arrays of the Mat_SeqAIJ with the pattern just computed on the device */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* the old transpose (if any) no longer matches the new pattern */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3704 
/* MatSeqAIJCUSPARSEGetArrayRead - exposes a read-only device pointer to the CSR value
   array of A, first mirroring any newer host values to the GPU.  Pair with
   MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw value array */
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is up to date before handing it out */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3723 
/* MatSeqAIJCUSPARSERestoreArrayRead - returns the pointer obtained with
   MatSeqAIJCUSPARSEGetArrayRead().  Read access cannot have modified the matrix, so
   neither the object state nor the offload mask is touched; the caller's pointer is
   simply invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the caller's copy of the device pointer */
  PetscFunctionReturn(0);
}
3733 
/* MatSeqAIJCUSPARSEGetArray - exposes a read/write device pointer to the CSR value array
   of A.  The host values are copied to the GPU first, and the offload mask is switched to
   PETSC_OFFLOAD_GPU since the device copy may be modified through the returned pointer.
   Pair with MatSeqAIJCUSPARSERestoreArray(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw value array */
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring any newer host values over before the caller starts reading/writing */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is now the authoritative one */
  PetscFunctionReturn(0);
}
3753 
/* MatSeqAIJCUSPARSERestoreArray - returns the pointer obtained with
   MatSeqAIJCUSPARSEGetArray().  The values may have been changed through the pointer, so
   the object state is increased to invalidate cached data that depends on them. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values may have changed */
  *a = NULL; /* invalidate the caller's copy of the device pointer */
  PetscFunctionReturn(0);
}
3766 
/* MatSeqAIJCUSPARSEGetArrayWrite - exposes a write-only device pointer to the CSR value
   array of A.  Unlike MatSeqAIJCUSPARSEGetArray(), the host values are NOT copied to the
   GPU first (the existing device values are handed out as-is, to be overwritten).  Pair
   with MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw value array */
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is now the authoritative one */
  PetscFunctionReturn(0);
}
3784 
/* MatSeqAIJCUSPARSERestoreArrayWrite - returns the pointer obtained with
   MatSeqAIJCUSPARSEGetArrayWrite().  The values are assumed to have been (over)written
   through the pointer, so the object state is increased. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values were rewritten */
  *a = NULL; /* invalidate the caller's copy of the device pointer */
  PetscFunctionReturn(0);
}
3797 
/* Strict weak ordering on (row, col, value, tag) tuples by (row, col) only; used to merge
   the COO forms of two matrices row-by-row while carrying values and an origin tag along. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    /* order primarily by row index, break ties with the column index */
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
3808 
/* Unary functor that offsets an integer by a fixed amount; used to shift column indices
   and row offsets when concatenating matrices. */
struct Shift
{
  int delta; /* amount added to every input */

  Shift(int d) : delta(d) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return c + delta;
  }
};
3820 
/* Merges two SeqAIJCUSPARSE matrices side by side, C = [A B] (the [A';B']' operation in MATLAB notation),
   entirely on the GPU.

   With reuse == MAT_INITIAL_MATRIX, C is created and its CSR pattern is built by merging the COO forms of
   A and B (the column indices of B shifted by A->cmap->n).  Ccusp->cooPerm records, for the entries of A
   (first Annz slots) followed by those of B, their destinations inside C's value array, so the values can
   be refreshed cheaply later.  If both A and B carry transposes, C's transpose is assembled as well by
   stacking A' on top of B'.  With reuse == MAT_REUSE_MATRIX, only the values of C (and of its transpose,
   if present) are updated by scattering A's and B's current values through the stored cooPerm.
   MAT_INPLACE_MATRIX is not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    /* hand-build the GPU-side mult structure for C (no compressed-row storage) */
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by the cusparse mat-vec calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* expand the CSR row offsets of A and B into explicit COO row indices */
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* int tags marking the origin of each merged entry: 1 for A, 0 for B */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's columns on the fly so they land to the right of A's block */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      /* merge the two sorted COO streams by (row,col); wPerm receives the origin tags */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place column shift applied above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
      /* split the merged positions by origin tag: cooPerm[0..Annz) gets the positions of
         A's entries in C, cooPerm[Annz..) those of B's entries */
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      /* compress the merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        /* C' = [A' ; B'] (vertical stacking): rows of A' first, then rows of B' */
        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overwrite A's closing offset with B's first (shifted) one */
        }
        if (BT) {
          /* B's transpose row offsets start after A's nnz entries */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the pattern to the host Mat_SeqAIJ so CPU code can inspect C */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C through the first Annz entries of cooPerm ... */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* ... and B's values through the remaining entries */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* C' values are simply A' values followed by B' values */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4123 
/* MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copies n entries of the (device) CSR value array
   of A into v.  When idx is provided, v[k] = values[idx[k]] (a gather performed on the
   GPU); otherwise the first n values are copied verbatim.  v may point to either host or
   device memory; the destination kind is detected with isCudaMem() and the proper copy
   direction chosen accordingly.

   Fix: the final transfer logging used PetscLogCpuToGpu() although the data movement in
   the host-destination case is device-to-host (cudaMemcpyDeviceToHost); it now correctly
   calls PetscLogGpuToCpu(). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    THRUSTINTARRAY widx(n); /* device copy of the gather indices */
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    THRUSTARRAY *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* gather into a device scratch buffer, then copy it back to the host below */
      w = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[widx[k]] for k = 0..n-1 */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (w) {
      cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    delete w;
  } else {
    /* no index set: plain copy of the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* a host destination implies a device-to-host transfer of the result */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4163