xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 45639126639a93db2facdb9199aa37ba9565a796)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* String tables for PetscOptionsEnum(): the leading entries are the enum value names in
   0-based value order, followed by the enum type name, the option prefix, and a NULL sentinel. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Note: entry positions must match the cuSPARSE enum values above; MatSetFromOptions_SeqAIJCUSPARSE()
     cross-checks a representative value from each table against the real enum at runtime. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
69 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
70 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
71 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
75 
76 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 
86 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
87 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
88 
/* Attach a caller-supplied CUDA stream to the cuSPARSE handle stored in A->spptr.
   Errors with PETSC_ERR_COR if the GPU-side structure has not been created yet. */
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
100 
/* Install a caller-supplied cuSPARSE handle into A->spptr, destroying any different handle
   previously held there, and (re)select device pointer mode on the active handle. */
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the old handle before taking ownership of the new one */
    if (cusp->handle) {
      cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
    }
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
117 
/* Forget (without destroying) the cuSPARSE handle reference held in A->spptr.
   Silently does nothing if A is not SEQAIJCUSPARSE or has no GPU structure. */
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  PetscBool          iscusparse;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse || !cusp) PetscFunctionReturn(0);
  /* drop the reference only; ownership of the handle lies elsewhere */
  if (cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
130 
/* MatFactorGetSolverType() implementation composed on CUSPARSE factor matrices:
   always reports MATSOLVERCUSPARSE, regardless of A. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
137 
138 /*MC
139   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
140   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
141 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
142   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
143   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
144   algorithms are not recommended. This class does NOT support direct solver operations.
145 
146   Level: beginner
147 
148 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
149 M*/
150 
/* Create the (unallocated) SEQAIJCUSPARSE factor matrix B, sized like square A, for
   factorization type ftype, and install the matching symbolic-factorization callbacks.
   Unsupported factor types raise PETSC_ERR_SUP. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* actual storage is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
176 
/* Type-specific backend for MatCUSPARSESetFormat(): for sequential matrices both
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL set the same (single) storage format field. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
194 
195 /*@
196    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
197    operation. Only the MatMult operation can use different GPU storage formats
198    for MPIAIJCUSPARSE matrices.
199    Not Collective
200 
201    Input Parameters:
202 +  A - Matrix of type SEQAIJCUSPARSE
203 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
204 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
205 
206    Output Parameter:
207 
208    Level: intermediate
209 
210 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
211 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation if one is composed on A; no-op otherwise */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
221 
222 /*@
223    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
224 
225    Collective on mat
226 
227    Input Parameters:
228 +  A - Matrix of type SEQAIJCUSPARSE
229 -  transgen - the boolean flag
230 
231    Level: intermediate
232 
233 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
234 @*/
/* Toggle explicit generation of the transpose for MatMultTranspose() on a SEQAIJCUSPARSE
   matrix; ignored for other matrix types. Disabling the flag also discards any cached
   transpose so a later re-enable cannot observe a stale one. Not valid on factored matrices. */
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (!iscusp) PetscFunctionReturn(0);              /* silently ignore other types */
  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    cusp->transgen = transgen;
    if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
254 
/* Process -mat_cusparse_* runtime options for a (non-factored) SEQAIJCUSPARSE matrix:
   transpose generation, storage formats, and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc
   algorithm choices. Factored matrices take no options here. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* -mat_cusparse_storage_format is processed after -mat_cusparse_mult_storage_format,
       so if both are given the former (MAT_CUSPARSE_ALL) wins */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
298 
/* Symbolic ILU: discard any stale GPU triangular factors, run the host (SeqAIJ) symbolic
   phase, and route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
310 
/* Symbolic LU: same pattern as the ILU variant — reset GPU factors, delegate the symbolic
   work to the SeqAIJ code, then point the numeric phase at the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
322 
/* Symbolic ICC: reset GPU factors, run the SeqAIJ symbolic phase, and install the
   CUSPARSE Cholesky numeric callback. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
334 
/* Symbolic Cholesky: reset GPU factors, run the SeqAIJ symbolic phase, and install the
   CUSPARSE Cholesky numeric callback. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
346 
/* Build (first call) or refresh (subsequent calls) the unit-diagonal lower-triangular
   factor L on the GPU from the host-side factored SeqAIJ data of A.
   First call: assembles a CSR copy of L in pinned host memory (explicit 1.0 diagonal
   appended to each row), uploads it into a new Mat_SeqAIJCUSPARSETriFactorStruct, and runs
   the cuSPARSE triangular-solve analysis. Later calls: only the numerical values are
   recomputed into the retained host buffer AA_h and re-uploaded (sparsity is unchanged).
   Work happens only when the CPU copy is newer than the GPU one (offloadmask). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;      /* pinned host CSR row offsets / column indices of L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* ai[n]-ai[1] counts the strict-lower entries of rows 1..n-1 (row 0 presumably has none
         in this factored storage — the loop below starts at i=1), plus n unit diagonals */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;    /* pinned host values of L */

        /* pinned (page-locked) host buffers give fast transfers to the GPU */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strict-lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: device CSR copy (32-bit index containers) assigned from the host buffers */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE API needs an explicit work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for cheap value-only refreshes; indices are no longer needed on the host */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix: same layout as the first build, values only */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
491 
/* Build (first call) or refresh (subsequent calls) the upper-triangular factor U on the
   GPU from the host-side factored SeqAIJ data of A. Mirrors the lower-triangular builder:
   the CSR copy of U is assembled backwards (rows n-1..0) in pinned host memory, with the
   diagonal stored as its reciprocal (non-unit diagonal), uploaded, and analyzed for
   triangular solves. Value-only refreshes reuse the retained pinned buffer AA_h. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;      /* pinned host CSR row offsets / column indices of U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* adiag[] appears to run backwards through the U storage, so adiag[0]-adiag[n]
         is the total entry count including diagonals */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;     /* pinned host values of U */

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first so the CSR
           offset can be decremented as each row is placed */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements: stored as the reciprocal of the pivot */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: device CSR copy (32-bit index containers) assigned from the host buffers */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE API needs an explicit work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for cheap value-only refreshes; indices are no longer needed on the host */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {       /* update values only; same layout as the first build */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
635 
/*
   Finishes an ILU factorization on the GPU: builds the lower and upper
   triangular factor matrices, sizes the work vector, and uploads the row
   and column permutations (only when they are not the identity, and only
   once per factor object).
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  IS                           colperm  = aij->icol;
  PetscBool                    rowIdentity,colIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* temporary vector used between the lower and upper triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation (applied before the lower triangular solve); upload once */
  ierr = ISIdentity(rowperm,&rowIdentity);CHKERRQ(ierr);
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *idx;

    ierr = ISGetIndices(rowperm,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(idx,idx+n);
    ierr = ISRestoreIndices(rowperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation (applied after the upper triangular solve); upload once */
  ierr = ISIdentity(colperm,&colIdentity);CHKERRQ(ierr);
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *idx;

    ierr = ISGetIndices(colperm,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(idx,idx+n);
    ierr = ISRestoreIndices(colperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
679 
/*
   Build (or refresh) the GPU triangular factors for an ICC factorization.

   The host factor is stored as the upper triangle only (SBAIJ-style layout:
   the diagonal entry of row i is the LAST entry of row i, at aj[ai[i+1]-1]).
   Two device CSR matrices are produced that share the same sparsity pattern:
     - upTriFactor: U with unit diagonal stored explicitly as 1/d_i and
       strictly-upper entries negated; solved with NON_TRANSPOSE.
     - loTriFactor: the same upper pattern but values scaled so that solving
       with CUSPARSE_OPERATION_TRANSPOSE performs the lower triangular solve
       (fill mode is deliberately set to UPPER for this reason).

   First call (factor structs not yet allocated) builds structure + values;
   subsequent calls (offloadmask CPU) only refresh the numerical values.
   NOTE(review): the value-refresh path assumes the sparsity pattern has not
   changed since the first call — verify against the factorization callers.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;       /* pinned host CSR row offsets / column indices for U */
  PetscScalar                       *AAUp;              /* pinned host values for the upper factor */
  PetscScalar                       *AALo;              /* pinned host values for the (transposed) lower factor */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* A->data is viewed through the SBAIJ header as well; presumably the two
     structs share their leading i/j/a layout — TODO confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned allocations so the thrust assigns below transfer efficiently */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];   /* diagonal stored as its inverse */
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* strictly-upper entries: negate for U; additionally scale by the
               inverted diagonal for the transposed-lower factor */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* UPPER on purpose: the lower solve is done as U^T with a TRANSPOSE op below */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: reuses the upper factor's pattern (AiUp/AjUp) */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        /* 2x: row offsets + column indices and values were uploaded for both factors */
        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* value-only refresh: same scaling as the structural path above,
           but the device pattern is reused unchanged */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
895 
/*
   Finishes an ICC factorization on the GPU: builds the triangular factor
   matrices, sizes the work vector, and uploads the permutation and its
   inverse when the ordering is non-trivial.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    permIdentity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(n);
  /* both strict triangles are represented, the diagonal only once */
  factors->nnz = (aij->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the permutation (rows) and its inverse (columns) if non-trivial */
  ierr = ISIdentity(perm,&permIdentity);CHKERRQ(ierr);
  if (!permIdentity) {
    IS             invperm;
    const PetscInt *ip,*iip;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&ip);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(n);
    factors->rpermIndices->assign(ip,ip+n);
    factors->cpermIndices = new THRUSTINTARRAY(n);
    factors->cpermIndices->assign(iip,iip+n);
    ierr = ISRestoreIndices(invperm,&iip);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&ip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
933 
/*
   Numeric LU factorization: performs the factorization on the host, then
   selects the solve kernels (natural-ordering fast path when both
   permutations are identities) and copies the triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bmat = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIdentity,colIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the host factorization needs up-to-date host values */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(bmat->row,&rowIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(bmat->col,&colIdentity);CHKERRQ(ierr);
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
  if (rowIdentity && colIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
964 
/*
   Numeric Cholesky factorization: performs the factorization on the host,
   then selects the solve kernels (natural-ordering fast path when the
   permutation is the identity) and copies the triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bmat = (Mat_SeqAIJ*)B->data;
  PetscBool      permIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the host factorization needs up-to-date host values */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(bmat->row,&permIdentity);CHKERRQ(ierr);
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
  if (permIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
994 
/*
   Build the transposes of both triangular factors (as CSC via csr2csc) and
   run the cuSPARSE solve analysis on them, so that MatSolveTranspose can be
   performed on the GPU. Stores the results in loTriFactorPtrTranspose and
   upTriFactorPtrTranspose of A->spptr.

   Fix: the second PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,...) after
   each csr2csc call was a Begin where an End was required, leaving the
   logging event begun twice and never ended (unbalanced event nesting in the
   PETSc performance log).
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor:
     same type/base/diag as the original, but the fill mode flips */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: transposition is explicit, so the solve is direct */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2cscEx2 requires an explicit work buffer; query its size first */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1196 
/* Lazily build and cache the transpose of the on-device matrix (cusparsestruct->matTranspose)
   so that MatMultTranspose can run as an ordinary SpMV on the transposed data.
     - CSR format: one csr2csc conversion; the CSC of A is exactly the CSR of A^T.
     - ELL/HYB format (pre CUDA-11 only): convert HYB->CSR, transpose via csr2csc, convert back CSR->HYB.
   No-op when transpose generation is disabled, the transpose is already cached, or A has no
   rows or no columns. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* quick exits: transpose generation disabled, transpose already built, or empty matrix */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr); /* the transpose inherits A's index base */
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants, required because the handle is put
     in CUSPARSE_POINTER_MODE_DEVICE below */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    matrixT->num_rows = A->cmap->n;    /* A^T: rows/cols swapped relative to A */
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* csr2csc needs the full (non-compressed) row offsets of A on the device; copy them from a->i */
    if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA-11 csr2cscEx2 requires an explicit work buffer; query its size and allocate it first */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            /* pre-CUDA-11 API: the CSC output arrays are passed (colInd, rowPtr);
                               writing A's CSC this way directly yields A^T's CSR in matrixT */
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* wrap the transposed CSR arrays in a generic cusparseSpMat descriptor for the SpMV API */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose) */
    /* NOTE(review): tempT's dimensions are not swapped (num_rows = rmap->n, num_cols = cmap->n),
       which looks correct only for square matrices -- verify for rectangular A */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries: the CSR/CSC scratch matrices are no longer needed once the HYB copy exists */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1355 
/* TODO: Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU with the (lazily built) transposed triangular factors:
   gather b through the row permutation, back-solve with the transposed upper factor,
   then with the transposed lower factor, and finally scatter through the column permutation.
   bb = right-hand side, xx = solution (both CUDA vectors). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, only on the first transpose solve */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b[rperm] */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U: for A = LU we have A^T = U^T L^T, so the U factor is applied first (x -> temp) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L (temp -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per nonzero of the factors minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1442 
/* Solve A^T x = b for a factorization in natural ordering (no row/column permutations):
   b feeds the transposed upper-factor solve directly and the transposed lower-factor solve
   writes straight into x, so no permutation gathers/scatters are needed.
   bb = right-hand side, xx = solution (both CUDA vectors). */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly, only on the first transpose solve */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: for A = LU we have A^T = U^T L^T, so the U factor is applied first (b -> temp) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L (temp -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per nonzero of the factors minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1510 
/* Solve A x = b on the GPU using the cached triangular factors: gather b through the row
   permutation, forward-solve with L, back-solve with U, and scatter the result through the
   column permutation into x.  bb = right-hand side, xx = solution (both CUDA vectors). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: temp <- b[rperm].  (The range length comes from
     the index iterators, so the same base pointer in both permutation iterators is fine.) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (temp -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U (x -> temp) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x <- temp[cperm] */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per nonzero of the factors minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1585 
/* Solve A x = b for a factorization in natural ordering (no row/column permutations):
   forward-solve with L directly from b into the work vector, then back-solve with U into x.
   bb = right-hand side, xx = solution (both CUDA vectors). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L (b -> temp) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U (temp -> x) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per nonzero of the factors minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1645 
/* Mirror the GPU matrix values back into the host CSR value array (a->a) when the device
   copy is the freshest one; afterwards both copies are valid (PETSC_OFFLOAD_BOTH).
   Only the numerical values move -- the sparsity pattern is unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* nothing to do unless the device holds the only up-to-date values */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  csr  = (CsrMatrix*)cusp->mat->mat;
  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1666 
/* Return a pointer to the host-side CSR value array, first syncing the values down from
   the GPU if the device copy is newer.  Because the caller may modify the returned array,
   the CPU copy becomes the only authoritative one (offload mask set to PETSC_OFFLOAD_CPU). */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  /* the array may be written by the caller, so invalidate the GPU copy */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1678 
1679 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1680 {
1681   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1682   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1683   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1684   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1685   PetscErrorCode               ierr;
1686   cusparseStatus_t             stat;
1687   PetscBool                    both = PETSC_TRUE;
1688   cudaError_t                  err;
1689 
1690   PetscFunctionBegin;
1691   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1692   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1693     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1694       /* Copy values only */
1695       CsrMatrix *matrix,*matrixT;
1696       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1697 
1698       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
1699       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1700       matrix->values->assign(a->a, a->a+a->nz);
1701       err  = WaitForCUDA();CHKERRCUDA(err);
1702       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1703       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1704 
1705       /* Update matT when it was built before */
1706       if (cusparsestruct->matTranspose) {
1707         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1708         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1709         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1710         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1711                             A->cmap->n, matrix->num_entries,
1712                             matrix->values->data().get(),
1713                             cusparsestruct->rowoffsets_gpu->data().get(),
1714                             matrix->column_indices->data().get(),
1715                             matrixT->values->data().get(),
1716                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1717                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1718                             CUSPARSE_ACTION_NUMERIC,indexBase,
1719                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1720                           #else
1721                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1722                             CUSPARSE_ACTION_NUMERIC, indexBase
1723                           #endif
1724 );CHKERRCUSPARSE(stat);
1725         err  = WaitForCUDA();CHKERRCUDA(err);
1726         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1727       }
1728     } else {
1729       PetscInt nnz;
1730       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1731       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1732       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1733       delete cusparsestruct->workVector;
1734       delete cusparsestruct->rowoffsets_gpu;
1735       try {
1736         if (a->compressedrow.use) {
1737           m    = a->compressedrow.nrows;
1738           ii   = a->compressedrow.i;
1739           ridx = a->compressedrow.rindex;
1740         } else {
1741           m    = A->rmap->n;
1742           ii   = a->i;
1743           ridx = NULL;
1744         }
1745         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
1746         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
1747         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1748         else nnz = a->nz;
1749 
1750         /* create cusparse matrix */
1751         cusparsestruct->nrows = m;
1752         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1753         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1754         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1755         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1756 
1757         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1758         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1759         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1760         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1761         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1762         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1763         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1764 
1765         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1766         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1767           /* set the matrix */
1768           CsrMatrix *mat= new CsrMatrix;
1769           mat->num_rows = m;
1770           mat->num_cols = A->cmap->n;
1771           mat->num_entries = nnz;
1772           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1773           mat->row_offsets->assign(ii, ii + m+1);
1774 
1775           mat->column_indices = new THRUSTINTARRAY32(nnz);
1776           mat->column_indices->assign(a->j, a->j+nnz);
1777 
1778           mat->values = new THRUSTARRAY(nnz);
1779           if (a->a) mat->values->assign(a->a, a->a+nnz);
1780 
1781           /* assign the pointer */
1782           matstruct->mat = mat;
1783          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1784           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1785             stat = cusparseCreateCsr(&matstruct->matDescr,
1786                                     mat->num_rows, mat->num_cols, mat->num_entries,
1787                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1788                                     mat->values->data().get(),
1789                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1790                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1791           }
1792          #endif
1793         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1794          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1795           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1796          #else
1797           CsrMatrix *mat= new CsrMatrix;
1798           mat->num_rows = m;
1799           mat->num_cols = A->cmap->n;
1800           mat->num_entries = nnz;
1801           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1802           mat->row_offsets->assign(ii, ii + m+1);
1803 
1804           mat->column_indices = new THRUSTINTARRAY32(nnz);
1805           mat->column_indices->assign(a->j, a->j+nnz);
1806 
1807           mat->values = new THRUSTARRAY(nnz);
1808           if (a->a) mat->values->assign(a->a, a->a+nnz);
1809 
1810           cusparseHybMat_t hybMat;
1811           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1812           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1813             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1814           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1815               matstruct->descr, mat->values->data().get(),
1816               mat->row_offsets->data().get(),
1817               mat->column_indices->data().get(),
1818               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1819           /* assign the pointer */
1820           matstruct->mat = hybMat;
1821 
1822           if (mat) {
1823             if (mat->values) delete (THRUSTARRAY*)mat->values;
1824             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1825             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1826             delete (CsrMatrix*)mat;
1827           }
1828          #endif
1829         }
1830 
1831         /* assign the compressed row indices */
1832         if (a->compressedrow.use) {
1833           cusparsestruct->workVector = new THRUSTARRAY(m);
1834           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1835           matstruct->cprowIndices->assign(ridx,ridx+m);
1836           tmp = m;
1837         } else {
1838           cusparsestruct->workVector = NULL;
1839           matstruct->cprowIndices    = NULL;
1840           tmp = 0;
1841         }
1842         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1843 
1844         /* assign the pointer */
1845         cusparsestruct->mat = matstruct;
1846       } catch(char *ex) {
1847         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1848       }
1849       err  = WaitForCUDA();CHKERRCUDA(err);
1850       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1851       cusparsestruct->nonzerostate = A->nonzerostate;
1852     }
1853     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
1854   }
1855   PetscFunctionReturn(0);
1856 }
1857 
/* Thrust functor applied to a zipped pair of sequences: accumulates the first
   tuple entry into the second, i.e. y[i] = y[i] + x[i] */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1867 
/* Thrust functor applied to a zipped pair of sequences: copies the first
   tuple entry into the second, i.e. y[i] = x[i] */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1877 
/* Thrust functor applied to a zipped pair of sequences: copies in the opposite
   direction of VecCUDAEquals, i.e. x[i] = y[i] (entry 0 gets entry 1) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1887 
/* Per-product scratch data attached to Mat_Product->data for the
   SeqAIJCUSPARSE mat-mat product kernels below; freed by MatDestroy_MatMatCusparse */
struct MatMatCusparse {
  PetscBool             cisdense;  /* caller's C was MATSEQDENSE (CPU): convert the result back after the GPU product */
  PetscScalar           *Bt;       /* device buffer holding B^T for ABt/RARt on CUDA < 11, where csrmm cannot transpose B */
  Mat                   X;         /* dense intermediate (A*B or A*B^t) used by the PtAP/RARt products */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;     /* flop count logged by the numeric phase */
  CsrMatrix             *Bcsr;     /* uncompressed CSR copy of B when B is stored in compressed row format */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* descriptor used in place of Bmat->matDescr when Bcsr is set */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* current size of mmBuffer, grown on demand by the numeric phase */
  void                  *mmBuffer;    /* workspace for cusparseSpMM / cusparseSpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1907 
/*
   Destructor installed in C->product->destroy: releases every device buffer,
   cusparse descriptor, the intermediate matrix X, and the MatMatCusparse
   structure itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr); /* cudaFree on a NULL pointer is a no-op */
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and workspaces are only allocated on demand, hence the guards */
  if (mm->matSpBDescr) { stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matBDescr)   { stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat); }
  if (mm->matCDescr)   { stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat); }
  if (mm->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat); }
  if (mm->mmBuffer)    { cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr); }
  if (mm->mmBuffer2)   { cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr); }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1932 
1933 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1934 
/*
   Numeric phase of the MatProduct C = op(A) op(B) with A of type SEQAIJCUSPARSE
   and B dense. Handles AB, AtB, ABt directly; for PtAP and RARt the dense
   intermediate X = A*B (resp. A*B^t) is computed here and the product is
   finished with a dense-dense multiply. Uses cusparseSpMM on CUDA >= 11 and
   cusparse_csr_spmm (with an explicit transpose of B via cublasXgeam when
   needed) on older toolkits.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the cusparse operation applied to A and the m x n size of the (intermediate) result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else { /* an explicitly stored transpose is available: use it with a non-transpose op */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse-dense product into the intermediate X; the others write into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor: destroy and recreate */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size exceeds the current allocation */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    /* explicitly transpose B into the preallocated buffer Bt with geam */
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with a dense-dense multiply of B and the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the type conversions performed above (C requested as CPU dense, B passed as CPU dense) */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2108 
/*
   Symbolic phase for the sparse(SEQAIJCUSPARSE)-times-dense products: sets the
   sizes and (CUDA dense) type of C, and allocates the MatMatCusparse product
   data, including the B^T scratch buffer (CUDA < 11, ABt/RARt only) and the
   dense intermediate X needed by RARt/PtAP. Installs the numeric routine.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions m x n for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2182 
/*
   Numeric phase of the sparse-sparse product C = op(A) op(B) with A, B, C all
   SEQAIJCUSPARSE in CSR format. Uses cusparseSpGEMM_compute/copy on CUDA >= 11
   and cusparse_csr_spgemm otherwise, then performs a reduced version of
   MatAssemblyEnd_SeqAIJ on C. When reusesym is set, the values were already
   computed during the symbolic phase and only the assembly bookkeeping runs.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just assemble */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands allow replacing the transposed product with the plain one */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2297 
2298 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2299 {
2300   Mat_Product                  *product = C->product;
2301   Mat                          A,B;
2302   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2303   Mat_SeqAIJ                   *a,*b,*c;
2304   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2305   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2306   PetscInt                     i,j,m,n,k;
2307   PetscBool                    flg;
2308   PetscErrorCode               ierr;
2309   cusparseStatus_t             stat;
2310   cudaError_t                  cerr;
2311   MatProductType               ptype;
2312   MatMatCusparse               *mmdata;
2313   PetscLogDouble               flops;
2314   PetscBool                    biscompressed,ciscompressed;
2315 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2316   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2317   size_t                       bufSize2;
2318   cusparseSpMatDescr_t         BmatSpDescr;
2319 #else
2320   int                          cnz;
2321 #endif
2322 
2323   PetscFunctionBegin;
2324   MatCheckProduct(C,1);
2325   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2326   A    = product->A;
2327   B    = product->B;
2328   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2329   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2330   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2331   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2332   a = (Mat_SeqAIJ*)A->data;
2333   b = (Mat_SeqAIJ*)B->data;
2334   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2335   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2336   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2337   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2338 
2339   /* product data */
2340   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2341   C->product->data    = mmdata;
2342   C->product->destroy = MatDestroy_MatMatCusparse;
2343 
2344   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2345   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2346   ptype = product->type;
2347   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2348   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2349   biscompressed = PETSC_FALSE;
2350   ciscompressed = PETSC_FALSE;
2351   switch (ptype) {
2352   case MATPRODUCT_AB:
2353     m = A->rmap->n;
2354     n = B->cmap->n;
2355     k = A->cmap->n;
2356     Amat = Acusp->mat;
2357     Bmat = Bcusp->mat;
2358     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2359     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2360     break;
2361   case MATPRODUCT_AtB:
2362     m = A->cmap->n;
2363     n = B->cmap->n;
2364     k = A->rmap->n;
2365     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2366     Amat = Acusp->matTranspose;
2367     Bmat = Bcusp->mat;
2368     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2369     break;
2370   case MATPRODUCT_ABt:
2371     m = A->rmap->n;
2372     n = B->rmap->n;
2373     k = A->cmap->n;
2374     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2375     Amat = Acusp->mat;
2376     Bmat = Bcusp->matTranspose;
2377     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2378     break;
2379   default:
2380     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2381   }
2382 
2383   /* create cusparse matrix */
2384   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2385   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2386   c     = (Mat_SeqAIJ*)C->data;
2387   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2388   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2389   Ccsr  = new CsrMatrix;
2390 
2391   c->compressedrow.use = ciscompressed;
2392   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2393     c->compressedrow.nrows = a->compressedrow.nrows;
2394     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2395     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2396     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2397     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2398     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2399   } else {
2400     c->compressedrow.nrows  = 0;
2401     c->compressedrow.i      = NULL;
2402     c->compressedrow.rindex = NULL;
2403     Ccusp->workVector       = NULL;
2404     Cmat->cprowIndices      = NULL;
2405   }
2406   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2407   Ccusp->mat      = Cmat;
2408   Ccusp->mat->mat = Ccsr;
2409   Ccsr->num_rows    = Ccusp->nrows;
2410   Ccsr->num_cols    = n;
2411   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2412   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2413   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2414   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2415   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2416   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2417   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2418   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2419   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2420   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2421   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2422     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2423     c->nz = 0;
2424     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2425     Ccsr->values = new THRUSTARRAY(c->nz);
2426     goto finalizesym;
2427   }
2428 
2429   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2430   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2431   Acsr = (CsrMatrix*)Amat->mat;
2432   if (!biscompressed) {
2433     Bcsr = (CsrMatrix*)Bmat->mat;
2434 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2435     BmatSpDescr = Bmat->matDescr;
2436 #endif
2437   } else { /* we need to use row offsets for the full matrix */
2438     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2439     Bcsr = new CsrMatrix;
2440     Bcsr->num_rows       = B->rmap->n;
2441     Bcsr->num_cols       = cBcsr->num_cols;
2442     Bcsr->num_entries    = cBcsr->num_entries;
2443     Bcsr->column_indices = cBcsr->column_indices;
2444     Bcsr->values         = cBcsr->values;
2445     if (!Bcusp->rowoffsets_gpu) {
2446       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2447       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2448       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2449     }
2450     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2451     mmdata->Bcsr = Bcsr;
2452 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2453     if (Bcsr->num_rows && Bcsr->num_cols) {
2454       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2455                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2456                                Bcsr->values->data().get(),
2457                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2458                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2459     }
2460     BmatSpDescr = mmdata->matSpBDescr;
2461 #endif
2462   }
2463   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2464   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2465   /* precompute flops count */
2466   if (ptype == MATPRODUCT_AB) {
2467     for (i=0, flops = 0; i<A->rmap->n; i++) {
2468       const PetscInt st = a->i[i];
2469       const PetscInt en = a->i[i+1];
2470       for (j=st; j<en; j++) {
2471         const PetscInt brow = a->j[j];
2472         flops += 2.*(b->i[brow+1] - b->i[brow]);
2473       }
2474     }
2475   } else if (ptype == MATPRODUCT_AtB) {
2476     for (i=0, flops = 0; i<A->rmap->n; i++) {
2477       const PetscInt anzi = a->i[i+1] - a->i[i];
2478       const PetscInt bnzi = b->i[i+1] - b->i[i];
2479       flops += (2.*anzi)*bnzi;
2480     }
2481   } else { /* TODO */
2482     flops = 0.;
2483   }
2484 
2485   mmdata->flops = flops;
2486   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2487 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2488   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2489   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2490                            NULL, NULL, NULL,
2491                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2492                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2493   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2494   /* ask bufferSize bytes for external memory */
2495   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2496                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2497                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2498                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2499   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2500   /* inspect the matrices A and B to understand the memory requirement for the next step */
2501   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2502                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2503                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2504                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2505   /* ask bufferSize again bytes for external memory */
2506   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2507                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2508                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2509                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2510   /* The CUSPARSE documentation is not clear, nor the API
2511      We need both buffers to perform the operations properly!
2512      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2513      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2514      is stored in the descriptor! What a messy API... */
2515   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2516   /* compute the intermediate product of A * B */
2517   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2518                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2519                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2520                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2521   /* get matrix C non-zero entries C_nnz1 */
2522   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2523   c->nz = (PetscInt) C_nnz1;
2524   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2525   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2526   Ccsr->values = new THRUSTARRAY(c->nz);
2527   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2528   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2529                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2530   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2531                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2532                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2533 #else
2534   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2535   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2536                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2537                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2538                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2539                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2540   c->nz = cnz;
2541   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2542   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2543   Ccsr->values = new THRUSTARRAY(c->nz);
2544   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2545 
2546   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2547   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2548      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2549      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2550   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2551                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2552                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2553                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2554                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2555 #endif
2556   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2557   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2558   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2559 finalizesym:
2560   c->singlemalloc = PETSC_FALSE;
2561   c->free_a       = PETSC_TRUE;
2562   c->free_ij      = PETSC_TRUE;
2563   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2564   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2565   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2566     PetscInt *d_i = c->i;
2567     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2568     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2569     ii   = *Ccsr->row_offsets;
2570     jj   = *Ccsr->column_indices;
2571     if (ciscompressed) d_i = c->compressedrow.i;
2572     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2573     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2574   } else {
2575     PetscInt *d_i = c->i;
2576     if (ciscompressed) d_i = c->compressedrow.i;
2577     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2578     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2579   }
2580   if (ciscompressed) { /* need to expand host row offsets */
2581     PetscInt r = 0;
2582     c->i[0] = 0;
2583     for (k = 0; k < c->compressedrow.nrows; k++) {
2584       const PetscInt next = c->compressedrow.rindex[k];
2585       const PetscInt old = c->compressedrow.i[k];
2586       for (; r < next; r++) c->i[r+1] = old;
2587     }
2588     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2589   }
2590   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2591   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2592   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2593   c->maxnz = c->nz;
2594   c->nonzerorowcnt = 0;
2595   c->rmax = 0;
2596   for (k = 0; k < m; k++) {
2597     const PetscInt nn = c->i[k+1] - c->i[k];
2598     c->ilen[k] = c->imax[k] = nn;
2599     c->nonzerorowcnt += (PetscInt)!!nn;
2600     c->rmax = PetscMax(c->rmax,nn);
2601   }
2602   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2603   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2604   Ccsr->num_entries = c->nz;
2605 
2606   C->nonzerostate++;
2607   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2608   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2609   Ccusp->nonzerostate = C->nonzerostate;
2610   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2611   C->preallocated  = PETSC_TRUE;
2612   C->assembled     = PETSC_FALSE;
2613   C->was_assembled = PETSC_FALSE;
2614   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2615     mmdata->reusesym = PETSC_TRUE;
2616     C->offloadmask   = PETSC_OFFLOAD_GPU;
2617   }
2618   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2619   PetscFunctionReturn(0);
2620 }
2621 
2622 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2623 
2624 /* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bdense = PETSC_FALSE,Bcusp = PETSC_FALSE,Ccusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* Classify the operands: is B dense? is B a CUSPARSE matrix (only considered when
     neither A nor B is bound to the CPU)? and, for triple products, is C CUSPARSE? */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bcusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ccusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ccusp);CHKERRQ(ierr);
    }
  }
  if (Bdense) {
    MatProductType t = product->type;

    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt || t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) { /* A lives on the CPU: use the host (SeqAIJ x SeqDense) path */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (t == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave mat->ops untouched */
  } else if (Bcusp && Ccusp) {
    MatProductType t = product->type;

    if (t == MATPRODUCT_AB || t == MATPRODUCT_AtB || t == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (t == MATPRODUCT_PtAP || t == MATPRODUCT_RARt || t == MATPRODUCT_ABC) {
      /* triple products fall back to the basic (pairwise) implementation */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for plain AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2682 
/* yy = A xx; thin wrapper over the shared mat-vec kernel */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_FALSE, herm = PETSC_FALSE: apply A as-is; NULL means no vector to add */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2691 
/* zz = A xx + yy; thin wrapper over the shared mat-vec kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_FALSE, herm = PETSC_FALSE: apply A as-is and add yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2700 
/* yy = A^H xx; thin wrapper over the shared mat-vec kernel */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_TRUE: conjugate transpose */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2709 
/* zz = A^H xx + yy; thin wrapper over the shared mat-vec kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_TRUE: conjugate transpose, then add yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2718 
/* yy = A^T xx; thin wrapper over the shared mat-vec kernel */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_FALSE: plain (non-conjugate) transpose */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2727 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.
   Shared worker behind the MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose wrappers;
   yy may be NULL, in which case z = op(A) x. Supports compressed-row storage (all-zero rows dropped
   from the CSR data), which requires a short work vector and scatter/gather steps around the SpMV. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for the SpMV; set and used only in the MAT_CUSPARSE_CSR paths */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  /* Empty matrix (no nonzero rows): op(A) x == 0, so just set or copy and return */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Either let cuSPARSE apply op on the untransposed data (herm, or explicit-transpose
       generation disabled), or use/build the explicitly stored transpose */
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        /* gather: workVector[k] = x[cprowIndices[k]] (VecCUDAEqualsReverse assigns right-to-left in the tuple) */
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation descriptor cache below; guard against enum layout changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      /* pre-CUDA-11: legacy csrmv-style interface, no descriptor caching needed */
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* z[cprowIndices[k]] += workVector[k] */
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: SpMV already wrote z of full length with beta=1 when y==z; only a
         distinct y still needs to be added */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero for multiply-add; without the add, one fewer flop per nonzero row */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2911 
/* zz = A^T xx + yy; thin wrapper over the shared mat-vec kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* trans = PETSC_TRUE, herm = PETSC_FALSE: plain transpose, then add yy */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2920 
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *devMat = NULL;

  PetscFunctionBegin;
  /* capture the split-CSR device matrix (if any) before the host-side assembly runs */
  if (A->factortype == MAT_FACTOR_NONE) devMat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
  /* on a final assembly of a matrix that is not bound to the CPU and has a device-side
     structure, the authoritative values now live on the GPU */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && devMat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2937 
2938 /* --------------------------------------------------------------------------------*/
2939 /*@
2940    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2942    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2943    assembly performance the user should preallocate the matrix storage by setting
2944    the parameter nz (or the array nnz).  By setting these parameters accurately,
2945    performance during matrix assembly can be increased by more than a factor of 50.
2946 
2947    Collective
2948 
2949    Input Parameters:
2950 +  comm - MPI communicator, set to PETSC_COMM_SELF
2951 .  m - number of rows
2952 .  n - number of columns
2953 .  nz - number of nonzeros per row (same for all rows)
2954 -  nnz - array containing the number of nonzeros in the various rows
2955          (possibly different for each row) or NULL
2956 
2957    Output Parameter:
2958 .  A - the matrix
2959 
2960    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2962    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2963 
2964    Notes:
2965    If nnz is given then nz is ignored
2966 
2967    The AIJ format (also called the Yale sparse matrix format or
2968    compressed row storage), is fully compatible with standard Fortran 77
2969    storage.  That is, the stored row and column indices can begin at
2970    either one (as in Fortran) or zero.  See the users' manual for details.
2971 
2972    Specify the preallocated storage with either nz or nnz (not both).
2973    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2974    allocation.  For large problems you MUST preallocate memory or you
2975    will get TERRIBLE performance, see the users' manual chapter on matrices.
2976 
2977    By default, this format uses inodes (identical nodes) when possible, to
2978    improve numerical efficiency of matrix-vector products and solves. We
2979    search for consecutive rows with the same nonzero structure, thereby
2980    reusing matrix information to achieve increased efficiency.
2981 
2982    Level: intermediate
2983 
2984 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2985 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create-set-type-preallocate sequence; see the manual page above for the
     recommended MatCreate()/MatSetType()/MatSetFromOptions() alternative */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* nnz (if non-NULL) overrides the uniform per-row count nz */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2997 
/* Destroy a SeqAIJCUSPARSE matrix: release the GPU-side structures (CSR data or
   triangular factors), free the optional device split-CSR matrix, detach the
   composed methods, then fall through to the base SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device matrix pointer before spptr is torn down; it is freed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host so device pointers stored inside it can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* diag.i is a separate device allocation only in the compressed-row case */
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* remove all composed methods so queries on a dead object return NULL */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3032 
3033 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3034 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a plain SeqAIJ copy on the host and then converting it
   in place to SeqAIJCUSPARSE (which installs the GPU structures and op table). */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3044 
/* Y = a*X + Y entirely on the GPU.
   Three paths: (1) SAME_NONZERO_PATTERN -> single cublas axpy on the value arrays;
   (2) SUBSET_NONZERO_PATTERN -> cusparse csrgeam with Y's pattern as output;
   (3) otherwise -> fall back to the host SeqAIJ implementation. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry,*csrx;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* mixed CPU/GPU pairing: defer to the host implementation */
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  cy   = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  cx   = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* equal nnz counts: compare row offsets and column indices on the device to
       detect an identical pattern the caller did not declare */
    bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
    if (eq) {
      eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    }
    if (eq) str = SAME_NONZERO_PATTERN;
  }

  if (str == SUBSET_NONZERO_PATTERN) {
    /* general sparse add via csrgeam: Y = a*X + 1.0*Y, result written into Y's
       existing pattern (X's pattern must be a subset of Y's) */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* scalars a,b live on the host for this call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11: csrgeam2 requires an explicit workspace buffer */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    /* restore the pointer mode other code paths expect */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical pattern: the values arrays line up element-for-element, so a
       single dense axpy over the nz values suffices */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* DIFFERENT_NONZERO_PATTERN: host path handles pattern growth */
    ierr = MatAXPY_SeqAIJ(Y,a,X,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3141 
/* Zero all stored values on both the GPU (CSR and transpose copies, if present)
   and the host array, then set the offload mask accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;   /* true when GPU values were zeroed too */
  Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    /* the cached transpose (if built) must be zeroed as well to stay consistent */
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  /* zero the host values directly (a->i[nrow] == total nonzeros) */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  /* both copies zeroed -> BOTH; otherwise only the host copy is valid */
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
3173 
/* Switch the matrix operation table between the host (SeqAIJ) and GPU
   (SeqAIJCUSPARSE) implementations. flg == PETSC_TRUE pins the matrix to the
   CPU (after syncing values back from the device); PETSC_FALSE installs the
   CUSPARSE kernels and composed methods. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factored matrices keep their solver-specific ops */
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is current before handing control to CPU kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* detach GPU-only composed methods so type queries report them unavailable */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes only help the CPU kernels */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3220 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE, allocating the GPU-side containers
   (or triangular-factor containers for factored matrices), installing the GPU
   op table, and setting VECCUDA as the default vector type. Supports
   MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX, and MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* MAT_INPLACE_MATRIX: *newmat is A itself */
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: CSR storage container plus a cusparse handle */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrix: container for the triangular factors */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU op table and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3268 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix,
   convert it in place, then process the cusparse-specific options.
   PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3281 
3282 /*MC
3283    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3284 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3286    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3287    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3288 
3289    Options Database Keys:
3290 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3291 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3292 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3293 
3294   Level: beginner
3295 
3296 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3297 M*/
3298 
3299 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3300 
3301 
/* Register the cusparse solver package for sequential AIJ matrices. All four
   factorization flavors are served by the same factory routine. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscInt            i;

  PetscFunctionBegin;
  for (i=0; i<4; i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3313 
/* Free a Mat_SeqAIJCUSPARSE container: the CSR mult structures (plain and
   transpose), the thrust work arrays, the cusparse handle, and finally the
   struct itself. Safe to call with *cusparsestruct == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* deleting a NULL thrust vector pointer is a no-op */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) is a documented no-op, so no guard is needed here */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3335 
/* Release the three thrust device arrays backing a CSR matrix, then the
   container itself; NULL the caller's pointer so repeated calls are harmless. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3348 
/* Free one triangular-factor structure: cusparse descriptor, analysis info,
   CSR storage, and the associated device/pinned-host buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is pinned host memory, hence cudaFreeHost */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3368 
/* Free a mult structure: the stored matrix (CSR or legacy HYB/ELL), the cusparse
   descriptor, the device scalar constants, and (CUDA >= 11) the SpMV descriptors
   and buffers. *matstruct is set to NULL on return. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* fix: the return code of CsrMatrix_Destroy() was previously dropped;
           check it like every other call in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalar constants */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV configuration per operation flavor (N/T/H) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3412 
/* Reset the triangular-factor container to an empty state: destroy all four
   factor structures and the permutation/work arrays, leaving the container
   (and its cusparse handle) allocated for reuse. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* pair each delete with a NULL so the container can be safely reused */
    delete fs->rpermIndices;
    fs->rpermIndices = NULL;
    delete fs->cpermIndices;
    fs->cpermIndices = NULL;
    delete fs->workVector;
    fs->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
3432 
/* Fully destroy the triangular-factor container: reset its contents, destroy
   the cusparse handle, and free the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* fix: the original used assignment-as-condition `if (handle = ...)`, which
       draws -Wparentheses and obscures intent; test the handle explicitly */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3449 
/* Strict lexicographic (i,j) ordering of COO index pairs, used to sort entries
   by row and, within a row, by column. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
3460 
/* Two COO index pairs are equal iff both row and column indices coincide. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3470 
/* Binary op mapping a pair of indices to 1 when they differ, 0 when equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3479 
/* Binary op computing the logical OR of two integers as {0,1}. */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3488 
3489 #include <thrust/iterator/discard_iterator.h>
/* Set (or add) matrix values from a COO array v[], using the permutation built
   by MatSetPreallocationCOO_SeqAIJCUSPARSE. cooPerm maps sorted CSR slots back
   to positions in v; cooPerm_a (when present) marks runs of duplicate (i,j)
   entries that must be summed. v may live on host or device. v == NULL with
   INSERT_VALUES zeroes the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;   /* device copy of v when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  /* no COO preallocation was done: just run a normal (no-op) assembly */
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  /* use v in place when it is device memory, otherwise stage it on the GPU */
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: sum duplicate entries into a scratch array, then add */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* no duplicates: permute v into CSR order and add element-wise */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* no duplicates: permute v into CSR order and overwrite */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;       /* delete NULL is a no-op */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3568 
3569 #include <thrust/binary_search.h>
/* Build the nonzero (CSR) structure of A, on both host and device, from n COO entries.

   Input:  n index pairs (coo_i[],coo_j[]) on the host; duplicates are allowed and
           arbitrary ordering is accepted.
   Output: sorted/uniqued CSR pattern in a->i/a->j (host) and mirrored on the GPU,
           plus cusp->cooPerm, the permutation mapping the caller's COO ordering to
           the sorted nonzero ordering (consumed later when values are set).  When
           duplicate (i,j) pairs exist, cusp->cooPerm_a additionally maps each
           original entry to its destination slot in the uniqued array so repeated
           values can be combined; when all entries are unique cooPerm_a is freed
           and set to NULL as a fast-path marker. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows that end up with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* If the number of COO entries changed since the previous call, the cached
     permutation arrays are stale and must be rebuilt */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);         /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);         /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n); /* per-row exclusive end offsets (filled by upper_bound below) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* Sort the (i,j) pairs lexicographically, dragging an identity sequence along so
       cooPerm records where each original entry landed after the sort */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i; /* keep the sorted row indices before unique() compacts them */
    THRUSTINTARRAY w = d_j; /* and the sorted column indices */

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* Mark with 1 each sorted position whose (i,j) differs from its predecessor
         (IJDiff on rows OR'ed with IJDiff on columns via IJSum), then the inclusive
         scan converts the 0/1 flags into the destination index of every sorted entry
         in the uniqued array */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of uniqued entries whose row index is <= r, i.e. the CSR row end offset */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* Rebuild the host-side CSR arrays of the Mat_SeqAIJ from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* d_j has been compacted in place by thrust::unique (via the zip iterator), so
       its first a->nz entries are the uniqued column indices */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* no entries: fall back to an empty standard preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* any cached transpose refers to the old pattern and must be discarded */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3669 
/* Return a read-only device pointer to the CSR values of A.

   The up-to-date data is pushed to the GPU first if it currently lives on the
   host.  Only the CSR storage format is supported.  The pointer must be given
   back with MatSeqAIJCUSPARSERestoreArrayRead(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out its values */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3688 
/* Release the device pointer obtained with MatSeqAIJCUSPARSEGetArrayRead().

   Read-only access: the object state is deliberately NOT increased (the matrix
   values were not modified); the caller's pointer is simply invalidated. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate so stale use is caught */
  PetscFunctionReturn(0);
}
3698 
/* Return a writable device pointer to the CSR values of A (read-write access).

   The current values are first synchronized to the GPU; afterwards the device
   copy is marked authoritative since the caller may modify it.  Pair with
   MatSeqAIJCUSPARSERestoreArray(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* read-write access: the caller must see the current values */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes the valid one */
  PetscFunctionReturn(0);
}
3718 
/* Release the device pointer obtained with MatSeqAIJCUSPARSEGetArray().

   Read-write access: the object state is increased because the values may have
   been changed through the returned pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate so stale use is caught */
  PetscFunctionReturn(0);
}
3731 
/* Return a write-only device pointer to the CSR values of A.

   Unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is performed:
   write-only semantics mean the existing values need not be transferred.  The
   device copy is marked authoritative.  Pair with
   MatSeqAIJCUSPARSERestoreArrayWrite(). */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes the valid one */
  PetscFunctionReturn(0);
}
3749 
/* Release the device pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite().

   Write access: the object state is increased because the values are assumed
   to have been (re)written through the returned pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL; /* invalidate so stale use is caught */
  PetscFunctionReturn(0);
}
3762 
/* Strict-weak ordering on (row, col, value, tag) tuples: lexicographic on
   (row, col) only; the value and tag components do not participate. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &lhs, const thrust::tuple<int, int, PetscScalar, int> &rhs)
  {
    /* rows decide unless equal, then columns break the tie */
    if (lhs.get<0>() != rhs.get<0>()) return lhs.get<0>() < rhs.get<0>();
    return lhs.get<1>() < rhs.get<1>();
  }
};
3773 
/* Unary functor adding a fixed integer offset to its argument; used to shift
   column indices / row offsets when concatenating CSR matrices. */
struct Shift
{
  int delta; /* constant offset applied to every input */

  Shift(int offset) : delta(offset) {}
  __host__ __device__
  inline int operator() (const int &idx)
  {
    return idx + delta;
  }
};
3785 
/* merges two SeqAIJCUSPARSE matrices side by side: C = [A,B], i.e. the [A';B']' operation in matlab notation

   reuse == MAT_INITIAL_MATRIX: builds C (structure and values) entirely on the GPU,
   then mirrors the CSR pattern to the host arrays of the Mat_SeqAIJ.  The merge is
   done in COO form: both matrices are converted with csr2coo, tagged (A entries
   with 1, B entries with 0), merged by (row,col) with IJCompare4, and converted
   back with coo2csr.  Ccusp->cooPerm records, for the first Annz slots, where each
   A value lands in C and, for the remaining Bnnz slots, where each B value lands;
   the MAT_REUSE_MATRIX branch uses it to scatter fresh values without re-merging.
   If both inputs carry transposes, the transpose of C is assembled by stacking
   the transposes.  MAT_INPLACE_MATRIX is not supported. */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* NOTE(review): "number or rows" below looks like a typo for "number of rows" in the error message */
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* --- create C and hand-build its cuSPARSE mult structure --- */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE; /* C is stored with full (uncompressed) rows */
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by SpMV calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      /* --- merge the patterns: csr2coo both inputs, merge tagged tuples, coo2csr the result --- */
      auto Acoo = new THRUSTINTARRAY32(Annz);
      auto Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff,*Broff;

      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              Aroff->data().get(),
                              Annz,
                              m,
                              Acoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              Broff->data().get(),
                              Bnnz,
                              m,
                              Bcoo->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
      /* tag entries so that, after the merge, A-origin (1) and B-origin (0) slots
         can be separated again to build cooPerm */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
      /* shift B's column indices on the fly so they land after A's columns in C */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
      auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz); /* first Annz slots of cooPerm: A entries; rest: B entries */
      PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
      /* undo the in-place shift of B's column indices done above */
      thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      /* split the merged positions by origin tag: A-tagged positions into p1..,
         B-tagged positions into p2.. (equivalent to the partition_copy above) */
      auto pred = thrust::identity<int>();
      PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
      PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
#endif
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo->data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A' ; B'] stacked vertically: concatenate row offsets (B's shifted by
           nnz(A)), column indices and values of the two transposes */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overlap: B's first offset overwrites A's trailing one */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* --- mirror the CSR pattern of C to the host Mat_SeqAIJ arrays --- */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* --- MAT_REUSE_MATRIX: pattern unchanged; scatter new values using the cached cooPerm --- */
    /* NOTE(review): typo "number or rows" in this error message too; also only B's row count
       is checked here — presumably A was validated against B above.  And, as above, this is
       a runtime string so it is left untouched in this comment-only pass. */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* sanity checks: the cached structures must still be consistent */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries); /* cooPerm[0..Annz) target A values, the rest B values */
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C through the first part of cooPerm */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C through the second part of cooPerm */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        /* refresh the values of C's cached transpose by re-stacking the transposes of A and B */
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4088