xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision e589036e9683f8580551b8f54d59a5f73edfef1f)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
/* Human-readable names for MatCUSPARSEStorageFormat, laid out for PetscOptionsEnum():
   the value names come first (in 0-based enum-value order), followed by the enum type
   name, the common prefix, and a terminating 0 */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* Note the deliberate orderings below: the list index must equal the cuSPARSE enum value,
     which is why e.g. COO_ALG4 (=5) appears after CSR_ALG1 (=4) in the SpMM list */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
53 
/* ---- forward declarations of the SeqAIJCUSPARSE implementation routines defined later in this file ---- */

/* symbolic phases of the Cholesky/ICC factorizations */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* symbolic phases of the LU/ILU factorizations */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* triangular solves, options handling, and matrix-vector products */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* teardown helpers for the GPU-side data structures */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* host<->device synchronization of the matrix data */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
88 
89 PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
90 {
91   cusparseStatus_t   stat;
92   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
93 
94   PetscFunctionBegin;
95   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
96   cusparsestruct->stream = stream;
97   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
98   PetscFunctionReturn(0);
99 }
100 
101 PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
102 {
103   cusparseStatus_t   stat;
104   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
105 
106   PetscFunctionBegin;
107   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
108   if (cusparsestruct->handle != handle) {
109     if (cusparsestruct->handle) {
110       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
111     }
112     cusparsestruct->handle = handle;
113   }
114   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
115   PetscFunctionReturn(0);
116 }
117 
118 PetscErrorCode MatCUSPARSEClearHandle(Mat A)
119 {
120   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
121   PetscBool          flg;
122   PetscErrorCode     ierr;
123 
124   PetscFunctionBegin;
125   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
126   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
127   if (cusparsestruct->handle) cusparsestruct->handle = 0;
128   PetscFunctionReturn(0);
129 }
130 
131 PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
132 {
133   PetscFunctionBegin;
134   *type = MATSOLVERCUSPARSE;
135   PetscFunctionReturn(0);
136 }
137 
138 /*MC
139   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
140   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
142   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
143   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
144   algorithms are not recommended. This class does NOT support direct solver operations.
145 
146   Level: beginner
147 
148 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
149 M*/
150 
151 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
152 {
153   PetscErrorCode ierr;
154   PetscInt       n = A->rmap->n;
155 
156   PetscFunctionBegin;
157   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
158   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
159   (*B)->factortype = ftype;
160   (*B)->useordering = PETSC_TRUE;
161   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
162 
163   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
164     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
165     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
166     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
167   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
168     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
169     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
170   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
171 
172   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
173   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
174   PetscFunctionReturn(0);
175 }
176 
177 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
178 {
179   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
180 
181   PetscFunctionBegin;
182   switch (op) {
183   case MAT_CUSPARSE_MULT:
184     cusparsestruct->format = format;
185     break;
186   case MAT_CUSPARSE_ALL:
187     cusparsestruct->format = format;
188     break;
189   default:
190     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
191   }
192   PetscFunctionReturn(0);
193 }
194 
195 /*@
196    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
197    operation. Only the MatMult operation can use different GPU storage formats
198    for MPIAIJCUSPARSE matrices.
199    Not Collective
200 
201    Input Parameters:
202 +  A - Matrix of type SEQAIJCUSPARSE
203 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
204 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
205 
206    Output Parameter:
207 
208    Level: intermediate
209 
210 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
211 @*/
212 PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
213 {
214   PetscErrorCode ierr;
215 
216   PetscFunctionBegin;
217   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
218   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
219   PetscFunctionReturn(0);
220 }
221 
222 /*@
223    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
224 
225    Collective on mat
226 
227    Input Parameters:
228 +  A - Matrix of type SEQAIJCUSPARSE
229 -  transgen - the boolean flag
230 
231    Level: intermediate
232 
233 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
234 @*/
235 PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
236 {
237   PetscErrorCode ierr;
238   PetscBool      flg;
239 
240   PetscFunctionBegin;
241   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
242   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
243   if (flg) {
244     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
245 
246     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
247     cusp->transgen = transgen;
248     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
249       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
250     }
251   }
252   PetscFunctionReturn(0);
253 }
254 
255 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
256 {
257   PetscErrorCode           ierr;
258   MatCUSPARSEStorageFormat format;
259   PetscBool                flg;
260   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
261 
262   PetscFunctionBegin;
263   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
264   if (A->factortype == MAT_FACTOR_NONE) {
265     PetscBool transgen = cusparsestruct->transgen;
266 
267     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
268     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
269 
270     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
271                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
272     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
273 
274     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
275                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
276     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
277    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
278     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
279     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
280                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
281     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
282     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
283 
284     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
285     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
286                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
287     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
288 
289     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
290     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
291                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
292     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
293    #endif
294   }
295   ierr = PetscOptionsTail();CHKERRQ(ierr);
296   PetscFunctionReturn(0);
297 }
298 
299 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
300 {
301   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
302   PetscErrorCode               ierr;
303 
304   PetscFunctionBegin;
305   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
306   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
307   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
308   PetscFunctionReturn(0);
309 }
310 
311 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
312 {
313   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
314   PetscErrorCode               ierr;
315 
316   PetscFunctionBegin;
317   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
318   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
319   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
320   PetscFunctionReturn(0);
321 }
322 
323 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
324 {
325   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
326   PetscErrorCode               ierr;
327 
328   PetscFunctionBegin;
329   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
330   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
331   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
332   PetscFunctionReturn(0);
333 }
334 
335 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
336 {
337   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
338   PetscErrorCode               ierr;
339 
340   PetscFunctionBegin;
341   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
342   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
343   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
344   PetscFunctionReturn(0);
345 }
346 
/* Builds (first call) or refreshes (subsequent calls) the GPU copy of the unit-diagonal
   lower triangular factor L from the host CSR data of the factored matrix A, and runs the
   cuSPARSE triangular-solve analysis on it.  Does nothing unless the matrix data currently
   lives on the CPU.  NOTE(review): assumes a->i/a->j store the strictly-lower part of each
   row first, as produced by MatILUFactorSymbolic_SeqAIJ — confirm against that layout. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;   /* host CSR row offsets / column indices of L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* page-locked host staging buffers (cudaMallocHost) for values and structure */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;   /* row 0 of L holds only the unit diagonal */
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the off-diagonal entries of row i, then append the unit diagonal entry */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* the thrust assign()s below copy the staged host arrays to the device */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE requires a caller-supplied work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AA_h is kept alive so later numeric-only updates can reuse it; Ai/Aj are not needed again */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        /* structure is unchanged; push only the new values to the device */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
491 
/* Builds (first call) or refreshes (subsequent calls) the GPU copy of the upper triangular
   factor U from the host CSR data of the factored matrix A, and runs the cuSPARSE
   triangular-solve analysis on it.  Does nothing unless the matrix data currently lives on
   the CPU.  NOTE(review): relies on a->diag storing U row starts in the reversed order
   produced by MatILUFactorSymbolic_SeqAIJ (hence nzUpper = adiag[0]-adiag[n] and the
   backwards row loop) — confirm against that layout. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;   /* host CSR row offsets / column indices of U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* page-locked host staging buffers (cudaMallocHost) for values and structure */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking the rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];   /* stored diagonal is inverted for the solve */
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* the thrust assign()s below copy the staged host arrays to the device */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* newer cuSPARSE requires a caller-supplied work buffer for the analysis/solve */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AA_h is kept alive so later numeric-only updates can reuse it; Ai/Aj are not needed again */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {   /* structure already on the GPU: refresh the numerical values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
635 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  IS                           colperm  = aij->icol;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build (or refresh) both ILU triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device, unless it is the identity */
  ierr = ISIdentity(rowperm,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    ierr = ISGetIndices(rowperm,&ridx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx,ridx+nrows);
    ierr = ISRestoreIndices(rowperm,&ridx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* likewise for the column permutation used by the upper-triangular solve */
  ierr = ISIdentity(colperm,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    ierr = ISGetIndices(colperm,&cidx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx,cidx+nrows);
    ierr = ISRestoreIndices(colperm,&cidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
679 
/*
  Builds the GPU triangular factors for an incomplete Cholesky (ICC)
  factorization.  Only one CSR matrix (the scaled upper triangle, including
  the inverted diagonal) is assembled on the host; the "lower" factor shares
  the same sparsity pattern and is handled by solving with the TRANSPOSE of
  an upper-triangular matrix (see the solveOp settings below).  On the first
  call both factor structures are created and the analysis phase is run; on
  subsequent calls only the numerical values are refreshed and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is re-read through Mat_SeqSBAIJ here; this assumes the
     ICC factor stores its i/j/a arrays in the SBAIJ layout -- confirm with the
     factorization routine that fills A->data */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays (same pattern, same nnz) */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          /* the diagonal is stored inverted (1/d_i) in both factors */
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              /* off-diagonals are negated; lower-factor entries are additionally
                 scaled by the (un-inverted) diagonal v[nz] */
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the lower factor is never stored: its solve applies the TRANSPOSE of the
           stored upper triangle, hence FILL_MODE_UPPER here as well */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same pattern (AiUp/AjUp) as the upper factor, different values */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* factors already exist: only recompute values and re-upload them */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
895 
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    permIsIdentity;
  PetscInt                     nrows    = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* both triangular factors share the pattern: count the strict triangle twice plus the diagonal */
  factors->nnz   = (aij->nz-nrows)*2 + nrows;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* a non-identity ordering needs the permutation and its inverse on the device */
  ierr = ISIdentity(perm,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    IS             permInv;
    const PetscInt *pinv,*p;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&permInv);CHKERRQ(ierr);
    ierr = ISGetIndices(permInv,&pinv);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&p);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(p,p+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(pinv,pinv+nrows);
    ierr = ISRestoreIndices(permInv,&pinv);CHKERRQ(ierr);
    ierr = ISDestroy(&permInv);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&p);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
933 
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      rowIsIdentity,colIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric LU runs on the host, so pull A back from the GPU first */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the solve kernels: the natural-ordering variants skip the permutation step */
  ierr = ISIdentity(bseq->row,&rowIsIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(bseq->col,&colIsIdentity);CHKERRQ(ierr);
  if (rowIsIdentity && colIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* move the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
964 
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      permIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric Cholesky runs on the host, so pull A back from the GPU first */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the solve kernels: the natural-ordering variants skip the permutation step */
  ierr = ISIdentity(bseq->row,&permIsIdentity);CHKERRQ(ierr);
  if (permIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* move the triangular factors to the GPU and run the cusparse solve analysis */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
994 
/*
  Builds the transposes (CSC forms) of both triangular factors and runs the
  cusparse solve analysis on them, so that MatSolveTranspose can be served
  directly.  The results are stashed in loTriFactorPtrTranspose and
  upTriFactorPtrTranspose of the Mat_SeqAIJCUSPARSETriFactors attached to A.

  Fix: the two MAT_CUSPARSEGenerateTranspose events were begun twice and
  never ended (PetscLogEventBegin where PetscLogEventEnd was required),
  which corrupts the PETSc event-logging nesting.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the fill mode
     flips because transposing swaps upper and lower */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: the event must be ended here or logging nests incorrectly */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: the event must be ended here or logging nests incorrectly */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1196 
/* Build the device-side transpose of A (cusparsestruct->matTranspose) so MatMultTranspose
   et al. can multiply with A^T directly. No-op unless transpose generation is enabled
   (transgen), the transpose has not already been built, and A has nonzero dimensions.
   For CSR the transpose is obtained with csr2csc; for the pre-CUDA-11 HYB/ELL formats
   the matrix is round-tripped HYB -> CSR -> CSC -> HYB. */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* early out: transpose disabled, already present, or empty matrix */
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix; the guard above guarantees matstructT was NULL here */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr); /* the transpose inherits A's index base */
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars (pointer mode DEVICE below) used by SpMV calls */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* transpose has swapped dimensions; nnz count is unchanged */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* upload A's host row offsets (a->i) to the device for the csr2csc call below.
       NOTE(review): any previously allocated rowoffsets_gpu is overwritten without delete --
       presumably NULL whenever this path runs; confirm to rule out a leak. */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicit workspace query + allocation before Csr2cscEx2 */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    /* the CSC of A is exactly the CSR of A^T; argument list differs across CUDA versions */
    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic-API descriptor for the transpose, needed by cusparseSpMV/SpMM on CUDA >= 11 */
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose)
       NOTE(review): tempT->num_rows/num_cols mirror temp's; for rectangular A the
       transpose should have swapped dimensions (and row_offsets sized A->cmap->n+1) --
       verify, though this path is only exercised for the legacy HYB/ELL formats. */
    tempT->num_rows = A->rmap->n;
    tempT->num_cols = A->cmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries: the HYB matrix owns its own storage now */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1355 
/* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU for a factored, permuted matrix:
   apply the row permutation to b, solve with U^T, then L^T, and finally apply the
   column permutation. The transposed triangular factors are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x <- b(rperm) */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Next, solve U^T: temp <- (U^T)^{-1} x. Extra nnz/policy/buffer arguments exist only for CUDA >= 9 */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L^T: x <- (L^T)^{-1} temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per factored nonzero, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1442 
/* Solve A^T x = b on the GPU when the factorization used natural ordering:
   no permutations are needed, so it is just the two triangular solves
   (U^T first, then L^T). Transposed factors are built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U^T: temp <- (U^T)^{-1} b. Extra nnz/policy/buffer arguments exist only for CUDA >= 9 */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L^T: x <- (L^T)^{-1} temp */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per factored nonzero, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1510 
/* Solve A x = b on the GPU for a factored, permuted matrix:
   apply the row permutation to b, solve with L, then U, and finally apply the
   column permutation to obtain x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: temp <- b(rperm).
     The range length comes from the rpermIndices iterators, so the same base bGPU is used at both ends. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: x <- L^{-1} temp. Extra nnz/policy/buffer arguments exist only for CUDA >= 9 */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: temp <- U^{-1} x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: x <- temp(cperm) */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per factored nonzero, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1585 
/* Solve A x = b on the GPU when the factorization used natural ordering:
   no permutations are needed, so it is just the two triangular solves
   (L first, then U). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: temp <- L^{-1} b. Extra nnz/policy/buffer arguments exist only for CUDA >= 9 */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: x <- U^{-1} temp */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two flops per factored nonzero, minus the n divisions counted once */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1645 
/* Mirror the GPU matrix values back into the host Mat_SeqAIJ value array when the
   device copy is the only up-to-date one; afterwards both copies are marked valid.
   Only the numerical values move -- the sparsity pattern is assumed unchanged. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) { /* host copy is stale: pull the values down */
    CsrMatrix *csr = (CsrMatrix*)gpu->mat->mat;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(0);
}
1666 
/* Hand the caller the host-side values array. The values are synchronized from
   the GPU first, and the offload mask is downgraded to CPU because the caller
   may modify the array through the returned pointer. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* ensure the host array holds current values before exposing it */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = aij->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1678 
1679 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1680 {
1681   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1682   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1683   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1684   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1685   PetscErrorCode               ierr;
1686   cusparseStatus_t             stat;
1687   cudaError_t                  err;
1688 
1689   PetscFunctionBegin;
1690   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1691   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1692     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1693       /* Copy values only */
1694       CsrMatrix *matrix,*matrixT;
1695       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1696 
1697       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1698       matrix->values->assign(a->a, a->a+a->nz);
1699       err  = WaitForCUDA();CHKERRCUDA(err);
1700       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1701       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1702 
1703       /* Update matT when it was built before */
1704       if (cusparsestruct->matTranspose) {
1705         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1706         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1707         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1708         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1709                             A->cmap->n, matrix->num_entries,
1710                             matrix->values->data().get(),
1711                             cusparsestruct->rowoffsets_gpu->data().get(),
1712                             matrix->column_indices->data().get(),
1713                             matrixT->values->data().get(),
1714                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1715                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1716                             CUSPARSE_ACTION_NUMERIC,indexBase,
1717                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1718                           #else
1719                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1720                             CUSPARSE_ACTION_NUMERIC, indexBase
1721                           #endif
1722 );CHKERRCUSPARSE(stat);
1723         err  = WaitForCUDA();CHKERRCUDA(err);
1724         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1725       }
1726     } else {
1727       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1728       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1729       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1730       delete cusparsestruct->workVector;
1731       delete cusparsestruct->rowoffsets_gpu;
1732       try {
1733         if (a->compressedrow.use) {
1734           m    = a->compressedrow.nrows;
1735           ii   = a->compressedrow.i;
1736           ridx = a->compressedrow.rindex;
1737         } else {
1738           m    = A->rmap->n;
1739           ii   = a->i;
1740           ridx = NULL;
1741         }
1742         cusparsestruct->nrows = m;
1743 
1744         /* create cusparse matrix */
1745         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1746         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1747         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1748         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1749 
1750         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1751         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1752         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1753         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1754         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1755         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1756         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1757 
1758         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1759         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1760           /* set the matrix */
1761           CsrMatrix *mat= new CsrMatrix;
1762           mat->num_rows = m;
1763           mat->num_cols = A->cmap->n;
1764           mat->num_entries = a->nz;
1765           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1766           mat->row_offsets->assign(ii, ii + m+1);
1767 
1768           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1769           mat->column_indices->assign(a->j, a->j+a->nz);
1770 
1771           mat->values = new THRUSTARRAY(a->nz);
1772           mat->values->assign(a->a, a->a+a->nz);
1773 
1774           /* assign the pointer */
1775           matstruct->mat = mat;
1776          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1777           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1778             stat = cusparseCreateCsr(&matstruct->matDescr,
1779                                     mat->num_rows, mat->num_cols, mat->num_entries,
1780                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1781                                     mat->values->data().get(),
1782                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1783                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1784           }
1785          #endif
1786         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1787          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1788           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1789          #else
1790           CsrMatrix *mat= new CsrMatrix;
1791           mat->num_rows = m;
1792           mat->num_cols = A->cmap->n;
1793           mat->num_entries = a->nz;
1794           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1795           mat->row_offsets->assign(ii, ii + m+1);
1796 
1797           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1798           mat->column_indices->assign(a->j, a->j+a->nz);
1799 
1800           mat->values = new THRUSTARRAY(a->nz);
1801           mat->values->assign(a->a, a->a+a->nz);
1802 
1803           cusparseHybMat_t hybMat;
1804           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1805           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1806             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1807           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1808               matstruct->descr, mat->values->data().get(),
1809               mat->row_offsets->data().get(),
1810               mat->column_indices->data().get(),
1811               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1812           /* assign the pointer */
1813           matstruct->mat = hybMat;
1814 
1815           if (mat) {
1816             if (mat->values) delete (THRUSTARRAY*)mat->values;
1817             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1818             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1819             delete (CsrMatrix*)mat;
1820           }
1821          #endif
1822         }
1823 
1824         /* assign the compressed row indices */
1825         if (a->compressedrow.use) {
1826           cusparsestruct->workVector = new THRUSTARRAY(m);
1827           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1828           matstruct->cprowIndices->assign(ridx,ridx+m);
1829           tmp = m;
1830         } else {
1831           cusparsestruct->workVector = NULL;
1832           matstruct->cprowIndices    = NULL;
1833           tmp = 0;
1834         }
1835         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1836 
1837         /* assign the pointer */
1838         cusparsestruct->mat = matstruct;
1839       } catch(char *ex) {
1840         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1841       }
1842       err  = WaitForCUDA();CHKERRCUDA(err);
1843       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1844       cusparsestruct->nonzerostate = A->nonzerostate;
1845     }
1846     A->offloadmask = PETSC_OFFLOAD_BOTH;
1847   }
1848   PetscFunctionReturn(0);
1849 }
1850 
/* Thrust functor for zipped (src,dst) tuples: accumulate element 0 into element 1,
   i.e. dst += src. Used with thrust::for_each over zip iterators. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
1860 
/* Thrust functor for zipped (src,dst) tuples: copy element 0 into element 1,
   i.e. dst = src. Used with thrust::for_each over zip iterators. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
1870 
/* Thrust functor for zipped tuples with the copy direction reversed relative to
   VecCUDAEquals: element 1 is copied into element 0 (get<0>(t) = get<1>(t)). */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1880 
/* Product data stored in C->product->data for SeqAIJCUSPARSE matrix-matrix
   products; released by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense; /* user's C was MATSEQDENSE (CPU); numeric phase converts the result back */
  PetscScalar           *Bt;     /* device buffer holding B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;       /* dense intermediate used by PtAP/RARt before the final dense GEMM */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;   /* flop count logged by the numeric phase */
  CsrMatrix             *Bcsr;   /* presumably an uncompressed CSR copy of B when B uses compressed rows -- see the "B may be in compressed row storage" usage below; confirm in symbolic phase */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor matching Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr; /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr; /* dense descriptor for C (or X for PtAP/RARt) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* bytes currently allocated in mmBuffer */
  void                  *mmBuffer;    /* workspace for cusparseSpMM / cusparseSpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1900 
/* Destructor attached to C->product->destroy: frees all GPU buffers, cuSPARSE
   descriptors, and the intermediate matrix owned by a MatMatCusparse. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so no guard is needed */
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and work buffers are created on demand, hence the NULL checks */
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); /* MatDestroy handles X == NULL */
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1925 
1926 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1927 
/* Numeric phase for products of a SeqAIJCUSPARSE matrix A with a dense CUDA matrix B.
   Supports AB, AtB, ABt directly via cusparse SpMM/csrmm; for PtAP and RARt the sparse
   product is first computed into the dense intermediate mmdata->X and then folded into C
   with a dense GEMM (MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A), the mult struct (plain or transposed copy of A), and the dims m x n of
     the sparse product output (for PtAP/RARt this is the intermediate X, not C) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      /* no explicit transpose available: let cusparse apply A^T */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly stored transpose of A instead */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* sparse product writes into X for PtAP/RARt, directly into C otherwise */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cusparseSpMM supports op(B), so ABt/RARt simply transpose B */
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* rebuild the dense descriptor of B if its leading dimension changed */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    /* lazily create the generic sparse descriptor for A (kept on the mult struct) */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size exceeds what we already have */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr; /* NOTE(review): intentionally shadows the outer cudaError_t cerr for CHKERRCUBLAS */

    /* explicitly form B^T into mmdata->Bt with a cuBLAS out-of-place transpose */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); /* 2 flops (mul+add) per nonzero per column of op(B) */
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* for PtAP/RARt fold the remaining dense factor: C = B^T * X or C = B * X^T resp. */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* undo the in-place conversions done at entry, if any */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2101 
/* Symbolic phase for products of a SeqAIJCUSPARSE matrix A with a dense matrix B:
   sizes C, forces it to SEQDENSECUDA, and allocates the MatMatCusparse product data
   (plus any scratch storage the numeric phase will need). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           nrows,ncols;
  PetscBool          Cwasdense,isaijcusp;
  PetscErrorCode     ierr;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isaijcusp);CHKERRQ(ierr);
  if (!isaijcusp) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  if      (product->type == MATPRODUCT_AB)   { nrows = A->rmap->n; ncols = B->cmap->n; }
  else if (product->type == MATPRODUCT_AtB)  { nrows = A->cmap->n; ncols = B->cmap->n; }
  else if (product->type == MATPRODUCT_ABt)  { nrows = A->rmap->n; ncols = B->rmap->n; }
  else if (product->type == MATPRODUCT_PtAP) { nrows = B->cmap->n; ncols = B->cmap->n; }
  else if (product->type == MATPRODUCT_RARt) { nrows = B->rmap->n; ncols = B->rmap->n; }
  else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  ierr = MatSetSizes(C,nrows,ncols,nrows,ncols);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&Cwasdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mm);CHKERRQ(ierr);
  mm->cisdense = Cwasdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mm->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mm->X);CHKERRQ(ierr);
    ierr = MatSetType(mm->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mm->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2175 
/* Numeric phase of the sparse-sparse product C = op(A)*op(B) with all three matrices of
   type SeqAIJCUSPARSE. On CUDA>=11 this runs cusparseSpGEMM_compute/copy against the
   descriptors built in the symbolic phase; earlier CUDA uses cusparse_csr_spgemm. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize; /* values already on the GPU: only the assembly bookkeeping below is needed */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  /* CopyToGPU is a no-op for CPU-bound matrices, so refuse rather than compute garbage */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* transposed products with a symmetric factor reduce to plain AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  /* transposes were generated in the symbolic phase; pick the matching mult structs */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute the values and copy them into C's CSR arrays; descriptors/buffers come from the symbolic phase */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2292 
2293 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2294 {
2295   Mat_Product                  *product = C->product;
2296   Mat                          A,B;
2297   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2298   Mat_SeqAIJ                   *a,*b,*c;
2299   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2300   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2301   PetscInt                     i,j,m,n,k;
2302   PetscBool                    flg;
2303   PetscErrorCode               ierr;
2304   cusparseStatus_t             stat;
2305   cudaError_t                  cerr;
2306   MatProductType               ptype;
2307   MatMatCusparse               *mmdata;
2308   PetscLogDouble               flops;
2309   PetscBool                    biscompressed,ciscompressed;
2310 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2311   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2312   size_t                       bufSize2;
2313   cusparseSpMatDescr_t         BmatSpDescr;
2314 #else
2315   int                          cnz;
2316 #endif
2317 
2318   PetscFunctionBegin;
2319   MatCheckProduct(C,1);
2320   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2321   A    = product->A;
2322   B    = product->B;
2323   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2324   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2325   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2326   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2327   a = (Mat_SeqAIJ*)A->data;
2328   b = (Mat_SeqAIJ*)B->data;
2329   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2330   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2331   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2332   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2333 
2334   /* product data */
2335   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2336   C->product->data    = mmdata;
2337   C->product->destroy = MatDestroy_MatMatCusparse;
2338 
2339   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2340   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2341   ptype = product->type;
2342   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2343   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2344   biscompressed = PETSC_FALSE;
2345   ciscompressed = PETSC_FALSE;
2346   switch (ptype) {
2347   case MATPRODUCT_AB:
2348     m = A->rmap->n;
2349     n = B->cmap->n;
2350     k = A->cmap->n;
2351     Amat = Acusp->mat;
2352     Bmat = Bcusp->mat;
2353     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2354     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2355     break;
2356   case MATPRODUCT_AtB:
2357     m = A->cmap->n;
2358     n = B->cmap->n;
2359     k = A->rmap->n;
2360     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2361     Amat = Acusp->matTranspose;
2362     Bmat = Bcusp->mat;
2363     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2364     break;
2365   case MATPRODUCT_ABt:
2366     m = A->rmap->n;
2367     n = B->rmap->n;
2368     k = A->cmap->n;
2369     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2370     Amat = Acusp->mat;
2371     Bmat = Bcusp->matTranspose;
2372     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2373     break;
2374   default:
2375     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2376   }
2377 
2378   /* create cusparse matrix */
2379   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2380   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2381   c     = (Mat_SeqAIJ*)C->data;
2382   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2383   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2384   Ccsr  = new CsrMatrix;
2385 
2386   c->compressedrow.use = ciscompressed;
2387   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2388     c->compressedrow.nrows = a->compressedrow.nrows;
2389     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2390     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2391     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2392     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2393     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2394   } else {
2395     c->compressedrow.nrows  = 0;
2396     c->compressedrow.i      = NULL;
2397     c->compressedrow.rindex = NULL;
2398     Ccusp->workVector       = NULL;
2399     Cmat->cprowIndices      = NULL;
2400   }
2401   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2402   Ccusp->mat      = Cmat;
2403   Ccusp->mat->mat = Ccsr;
2404   Ccsr->num_rows    = Ccusp->nrows;
2405   Ccsr->num_cols    = n;
2406   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2407   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2408   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2409   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2410   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2411   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2412   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2413   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2414   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2415   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2416   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2417     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2418     c->nz = 0;
2419     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2420     Ccsr->values = new THRUSTARRAY(c->nz);
2421     goto finalizesym;
2422   }
2423 
2424   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2425   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2426   Acsr = (CsrMatrix*)Amat->mat;
2427   if (!biscompressed) {
2428     Bcsr = (CsrMatrix*)Bmat->mat;
2429 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2430     BmatSpDescr = Bmat->matDescr;
2431 #endif
2432   } else { /* we need to use row offsets for the full matrix */
2433     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2434     Bcsr = new CsrMatrix;
2435     Bcsr->num_rows       = B->rmap->n;
2436     Bcsr->num_cols       = cBcsr->num_cols;
2437     Bcsr->num_entries    = cBcsr->num_entries;
2438     Bcsr->column_indices = cBcsr->column_indices;
2439     Bcsr->values         = cBcsr->values;
2440     if (!Bcusp->rowoffsets_gpu) {
2441       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2442       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2443       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2444     }
2445     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2446     mmdata->Bcsr = Bcsr;
2447 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2448     if (Bcsr->num_rows && Bcsr->num_cols) {
2449       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2450                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2451                                Bcsr->values->data().get(),
2452                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2453                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2454     }
2455     BmatSpDescr = mmdata->matSpBDescr;
2456 #endif
2457   }
2458   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2459   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2460   /* precompute flops count */
2461   if (ptype == MATPRODUCT_AB) {
2462     for (i=0, flops = 0; i<A->rmap->n; i++) {
2463       const PetscInt st = a->i[i];
2464       const PetscInt en = a->i[i+1];
2465       for (j=st; j<en; j++) {
2466         const PetscInt brow = a->j[j];
2467         flops += 2.*(b->i[brow+1] - b->i[brow]);
2468       }
2469     }
2470   } else if (ptype == MATPRODUCT_AtB) {
2471     for (i=0, flops = 0; i<A->rmap->n; i++) {
2472       const PetscInt anzi = a->i[i+1] - a->i[i];
2473       const PetscInt bnzi = b->i[i+1] - b->i[i];
2474       flops += (2.*anzi)*bnzi;
2475     }
2476   } else { /* TODO */
2477     flops = 0.;
2478   }
2479 
2480   mmdata->flops = flops;
2481   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2482 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2483   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2484   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2485                            NULL, NULL, NULL,
2486                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2487                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2488   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2489   /* ask bufferSize bytes for external memory */
2490   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2491                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2492                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2493                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2494   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUSPARSE(stat);
2495   /* inspect the matrices A and B to understand the memory requirement for the next step */
2496   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2497                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2498                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2499                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2500   /* ask bufferSize again bytes for external memory */
2501   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2502                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2503                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2504                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2505   /* The CUSPARSE documentation is not clear, nor the API
2506      We need both buffers to perform the operations properly!
2507      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2508      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2509      is stored in the descriptor! What a messy API... */
2510   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUSPARSE(stat);
2511   /* compute the intermediate product of A * B */
2512   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2513                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2514                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2515                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2516   /* get matrix C non-zero entries C_nnz1 */
2517   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2518   c->nz = (PetscInt) C_nnz1;
2519   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2520   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2521   Ccsr->values = new THRUSTARRAY(c->nz);
2522   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2523   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2524                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2525   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2526                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2527                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2528 #else
2529   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2530   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2531                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2532                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2533                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2534                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2535   c->nz = cnz;
2536   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2537   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2538   Ccsr->values = new THRUSTARRAY(c->nz);
2539   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2540 
2541   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2542   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2543      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2544      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2545   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2546                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2547                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2548                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2549                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2550 #endif
2551   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2552   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2553   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2554 finalizesym:
2555   c->singlemalloc = PETSC_FALSE;
2556   c->free_a       = PETSC_TRUE;
2557   c->free_ij      = PETSC_TRUE;
2558   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2559   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2560   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2561     PetscInt *d_i = c->i;
2562     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2563     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2564     ii   = *Ccsr->row_offsets;
2565     jj   = *Ccsr->column_indices;
2566     if (ciscompressed) d_i = c->compressedrow.i;
2567     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2568     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2569   } else {
2570     PetscInt *d_i = c->i;
2571     if (ciscompressed) d_i = c->compressedrow.i;
2572     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2573     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2574   }
2575   if (ciscompressed) { /* need to expand host row offsets */
2576     PetscInt r = 0;
2577     c->i[0] = 0;
2578     for (k = 0; k < c->compressedrow.nrows; k++) {
2579       const PetscInt next = c->compressedrow.rindex[k];
2580       const PetscInt old = c->compressedrow.i[k];
2581       for (; r < next; r++) c->i[r+1] = old;
2582     }
2583     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2584   }
2585   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2586   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2587   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2588   c->maxnz = c->nz;
2589   c->nonzerorowcnt = 0;
2590   c->rmax = 0;
2591   for (k = 0; k < m; k++) {
2592     const PetscInt nn = c->i[k+1] - c->i[k];
2593     c->ilen[k] = c->imax[k] = nn;
2594     c->nonzerorowcnt += (PetscInt)!!nn;
2595     c->rmax = PetscMax(c->rmax,nn);
2596   }
2597   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2598   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2599   Ccsr->num_entries = c->nz;
2600 
2601   C->nonzerostate++;
2602   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2603   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2604   Ccusp->nonzerostate = C->nonzerostate;
2605   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2606   C->preallocated  = PETSC_TRUE;
2607   C->assembled     = PETSC_FALSE;
2608   C->was_assembled = PETSC_FALSE;
2609   if (product->api_user) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2610     mmdata->reusesym = PETSC_TRUE;
2611     C->offloadmask   = PETSC_OFFLOAD_GPU;
2612   }
2613   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2614   PetscFunctionReturn(0);
2615 }
2616 
2617 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2618 
/* Select the symbolic-product implementation for a SeqAIJCUSPARSE A; handles sparse or dense B.
   Falls back to the host (SeqAIJ) implementations when an operand is bound to the CPU. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bdense = PETSC_FALSE,Bcusp = PETSC_FALSE,Ccusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* Classify operands: is B dense? is B usable on the GPU? (and, for ABC only, is C usable on the GPU?) */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  if (!product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bcusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ccusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ccusp);CHKERRQ(ierr);
    }
  }
  if (Bdense) {
    if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB ||
        product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
        product->type == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) { /* A lives on the CPU: defer to the host AIJ-times-dense path */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* any other product type: leave productsymbolic unset, as before */
  } else if (Bcusp && Ccusp) {
    if (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_ABC) {
      /* triple products are composed from pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2677 
/* yy = A xx: thin wrapper over the shared kernel (no transpose, no Hermitian, no add since yy slot is NULL) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2686 
/* zz = A xx + yy: thin wrapper over the shared kernel (no transpose, no Hermitian) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2695 
/* yy = A^H xx: thin wrapper over the shared kernel (trans and herm both set, no add since yy slot is NULL) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2704 
/* zz = A^H xx + yy: thin wrapper over the shared kernel (trans and herm both set) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2713 
/* yy = A^T xx: thin wrapper over the shared kernel (transpose, non-Hermitian, no add since yy slot is NULL) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2722 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared kernel behind all the MatMult* wrappers above. yy may be NULL (no add) and yy == zz is
   allowed (in-place add). When the matrix stores compressed rows (zero rows dropped), the product
   is computed into/from a shorter work vector and scattered/gathered against the full-length
   vectors using the compressed-row index list (cprowIndices). */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y as seen by cusparseSpMV; set in the CSR branches below */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows, so op(A) x contributes nothing: result is y (or zero when there is no add) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      /* apply the (conjugate) transpose inside cusparse using the non-transposed storage */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* use an explicitly stored transpose (generated on demand) with a non-transpose operation */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache of SpMV descriptors/buffers; guard against enum layout changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose path: the SpMV already wrote the full-length zz with beta=1 when yy==zz; otherwise add yy explicitly */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2906 
/* zz = A^T xx + yy: thin wrapper over the shared kernel (transpose, non-Hermitian) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2915 
/* Finish (or flush) assembly: run the host-side AIJ assembly, then, for a final assembly of an
   unfactored matrix that holds a device-resident split-CSR structure, mark the data as living on the GPU */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *devMat = NULL;

  PetscFunctionBegin;
  /* fetch the device matrix (only unfactored matrices carry one) before the host-side assembly runs */
  if (A->factortype == MAT_FACTOR_NONE) devMat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* this does very little if assembled on GPU - call it? */
  /* only a final assembly of a GPU-capable matrix with a device matrix updates the offload mask */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && devMat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2932 
2933 /* --------------------------------------------------------------------------------*/
2934 /*@
2935    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2937    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2938    assembly performance the user should preallocate the matrix storage by setting
2939    the parameter nz (or the array nnz).  By setting these parameters accurately,
2940    performance during matrix assembly can be increased by more than a factor of 50.
2941 
2942    Collective
2943 
2944    Input Parameters:
2945 +  comm - MPI communicator, set to PETSC_COMM_SELF
2946 .  m - number of rows
2947 .  n - number of columns
2948 .  nz - number of nonzeros per row (same for all rows)
2949 -  nnz - array containing the number of nonzeros in the various rows
2950          (possibly different for each row) or NULL
2951 
2952    Output Parameter:
2953 .  A - the matrix
2954 
2955    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2957    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2958 
2959    Notes:
2960    If nnz is given then nz is ignored
2961 
2962    The AIJ format (also called the Yale sparse matrix format or
2963    compressed row storage), is fully compatible with standard Fortran 77
2964    storage.  That is, the stored row and column indices can begin at
2965    either one (as in Fortran) or zero.  See the users' manual for details.
2966 
2967    Specify the preallocated storage with either nz or nnz (not both).
2968    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2969    allocation.  For large problems you MUST preallocate memory or you
2970    will get TERRIBLE performance, see the users' manual chapter on matrices.
2971 
2972    By default, this format uses inodes (identical nodes) when possible, to
2973    improve numerical efficiency of matrix-vector products and solves. We
2974    search for consecutive rows with the same nonzero structure, thereby
2975    reusing matrix information to achieve increased efficiency.
2976 
2977    Level: intermediate
2978 
2979 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2980 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); /* sequential matrix: local sizes equal global sizes */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* call the SeqAIJ preallocation directly; nnz is not modified despite the non-const cast */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2992 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device-side split-CSR matrix (if any) so it can be freed after spptr is gone */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    /* factored matrices keep triangular-factor data in spptr instead */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back to the host to recover the device pointers it owns */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i appears to be device-allocated only in the compressed-row case --
         TODO confirm against the allocation site */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* remove composed methods before dispatching to the base SeqAIJ destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3027 
3028 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3029 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* duplicate on the host as SeqAIJ, then convert the copy in place to CUSPARSE */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3039 
/* Y = a*X + Y for two SEQAIJCUSPARSE matrices. Falls back to the host MatAXPY_SeqAIJ
   whenever either operand is not CUSPARSE or the nonzero patterns differ; otherwise
   performs a single cublas axpy over the CSR value arrays on the GPU. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *x,*y;
  PetscBool      flgx,flgy;

  PetscFunctionBegin;
  /* validate the headers before dereferencing X->data/Y->data (previously the
     checks ran after the data pointers were read and after the a==0 early return) */
  PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
  PetscValidHeaderSpecific(X,MAT_CLASSID,3);
  if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
  x = (Mat_SeqAIJ*)X->data;
  y = (Mat_SeqAIJ*)Y->data;
  ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
  if (!flgx || !flgy) { /* not both CUSPARSE: host path */
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
  if (str == DIFFERENT_NONZERO_PATTERN) {
    /* the patterns may actually coincide; detecting that enables the fast GPU path */
    if (x->nz == y->nz) {
      PetscBool e;
      ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
      if (e) {
        ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
        if (e) str = SAME_NONZERO_PATTERN;
      }
    }
  }
  if (str != SAME_NONZERO_PATTERN) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else {
    Mat_SeqAIJCUSPARSE *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;
    if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
      /* a GPU copy is missing: do the axpy on the CPU, then refresh both GPU copies */
      if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
      }
      if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
      }
      ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
    } else {
      /* same pattern, both GPU copies present: one cublas axpy over the value arrays */
      cublasHandle_t cublasv2handle;
      cublasStatus_t cberr;
      cudaError_t    err;
      PetscScalar    alpha = a;
      PetscBLASInt   one = 1, bnz = 1;
      CsrMatrix      *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
      CsrMatrix      *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
      PetscScalar    *aa_y = matrix_y->values->data().get();
      PetscScalar    *aa_x = matrix_x->values->data().get();

      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
      ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
      /* only the GPU copy of Y was modified */
      if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU;
      else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
      ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
3114 
/* Zero all stored values of the matrix on both the host and (when present and in
   CSR format) the device, preserving the nonzero structure. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;
  Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    /* the casts below are only valid for CSR storage; previously they were applied
       unconditionally, which is undefined for ELL/HYB. For non-CSR formats the GPU
       copy is simply invalidated (offloadmask stays CPU) and rebuilt on demand. */
    if (spptr->format == MAT_CUSPARSE_CSR) {
      if (spptr->mat) {
        CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
        if (matrix->values) {
          both = PETSC_TRUE; /* GPU copy zeroed too, so data is valid on both sides */
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
      if (spptr->matTranspose) {
        CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
        if (matrix->values) {
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
    }
  }
  /* zero the host values (a->i[nrows] == total number of stored entries) */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3146 
/* Switch the operation table and composed methods between the host SeqAIJ
   implementations (flg == PETSC_TRUE) and the CUSPARSE ones (flg == PETSC_FALSE).
   Factored matrices are never rebound. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to the CPU: make sure the host copy is current first */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* remove GPU-specific composed methods while bound to the CPU */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  a->inode.use = flg; /* inode optimization is a host-only feature */
  PetscFunctionReturn(0);
}
3193 
/* Convert a SeqAIJ matrix to SEQAIJCUSPARSE, creating the CUSPARSE side data and
   installing the GPU operation table. Handles MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX
   and MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, A itself is converted */
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: allocate the CUSPARSE struct with its own handle */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrix: triangular-factor container instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the GPU operation table and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3241 
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* build a plain SeqAIJ matrix, then convert it in place to the CUSPARSE type */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscObjectOptionsBegin declares the PetscOptionsObject used below */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3254 
3255 /*MC
3256    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3257 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
3259    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3260    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3261 
3262    Options Database Keys:
3263 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3264 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3265 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3266 
3267   Level: beginner
3268 
3269 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3270 M*/
3271 
3272 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3273 
3274 
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  /* register the cusparse solver package for every factorization it supports */
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3286 
/* Free everything owned by a Mat_SeqAIJCUSPARSE struct and the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    /* GPU copies of the matrix and its transpose, then the auxiliary thrust arrays */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* unconditional free is safe here: cudaFree(NULL) is a documented no-op */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3308 
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  /* free the three thrust device vectors backing a CSR matrix, then the struct */
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3321 
/* Release all cusparse/CUDA resources held by one triangular-factor struct. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is freed with cudaFreeHost, i.e. it is page-locked host memory */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3341 
/* Release all resources held by a Mat_SeqAIJCUSPARSEMultStruct (the GPU copy of the
   matrix used for products), honoring the storage format. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the error code; it was previously discarded silently */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants used as SpMV alpha/beta arguments */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* generic-API descriptors and the per-operation SpMV buffers (CUDA 11+) */
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3385 
/* Free the factor data but keep the container and its cusparse handle so the
   struct can be reused for a new factorization. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    /* null the dangling pointers since the container survives this call */
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
  }
  PetscFunctionReturn(0);
}
3405 
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    /* release all factor data, then the cusparse handle, then the container */
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3422 
struct IJCompare
{
  /* lexicographic (row,col) ordering of COO index pairs */
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return (t1.get<0>() != t2.get<0>()) ? (t1.get<0>() < t2.get<0>()) : (t1.get<1>() < t2.get<1>());
  }
};
3433 
struct IJEqual
{
  /* true when two COO entries share both row and column index */
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3443 
struct IJDiff
{
  /* 1 where consecutive keys differ, 0 where they repeat (adjacent_difference helper) */
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3452 
struct IJSum
{
  /* logical OR of two 0/1 flags, used to merge row and column difference masks */
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3461 
3462 #include <thrust/iterator/discard_iterator.h>
/* Insert/add the values v[] (ordered as the (i,j) pairs given to
   MatSetPreallocationCOO) into the GPU CSR matrix, using the permutation
   (cooPerm) and duplicate-run keys (cooPerm_a) computed at preallocation. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL,*cooPerm_w;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation was done: fall back to a plain final assembly */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (!v) {
    /* NULL values: INSERT zeroes the matrix, ADD is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* values live on the host: stage them in a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* repeated (i,j) entries: reduce duplicate runs into cooPerm_w first, then add */
      cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* unique entries: add the permuted values straight onto the CSR values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: overwrite the CSR values with the permuted input */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
finalize:
  delete cooPerm_v; /* deleting a NULL pointer is harmless */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3543 
3544 #include <thrust/binary_search.h>
/*
  Sets up the COO (coordinate-format) preallocation of a SeqAIJCUSPARSE matrix.

  The n host index pairs (coo_i[k],coo_j[k]) are copied to the device and sorted
  by (row,col) while cusp->cooPerm records where each original entry lands after
  the sort.  Duplicate (i,j) pairs are then compacted with thrust::unique; when
  duplicates exist, cusp->cooPerm_a is built so that the k-th *sorted* entry maps
  to its slot in the compacted nonzero list (presumably so repeated entries can be
  summed later when values are set -- confirm against the MatSetValuesCOO path);
  when all entries are unique, cooPerm_a is deleted and left NULL.  The compacted
  structure is copied back into the host CSR arrays (a->i, a->j), the SeqAIJ
  bookkeeping fields are filled in, the values are zeroed, and the cusparse
  device structures are (re)built.  The matrix is left unassembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* reuse the previously allocated permutation arrays only if the entry count matches */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);         /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);         /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n); /* per-row entry counts in prefix form; becomes a->i+1 */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    /* zip the two index arrays so they sort/compact together as (i,j) pairs */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* cooPerm starts as the identity and records the destination of every input entry after the sort */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;  /* snapshot of the sorted row indices (scratch for the duplicate scan below) */
    THRUSTINTARRAY w = d_j;  /* snapshot of the sorted column indices */

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag the sorted positions where (i,j) differs from the predecessor, then
         inclusive-scan the flags: each sorted entry gets the index of its unique slot */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* for each row r, binary-search the (sorted, compacted) row indices for the number of
       entries with row <= r: this is exactly the CSR row offset array shifted by one */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host-side CSR structure from scratch */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* thrust::unique compacted the zipped arrays in place, so the first a->nz entries of d_j
       are the column indices of the unique nonzeros */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* no entries: fall back to an empty standard preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* any cached transpose refers to the old nonzero pattern and must be discarded */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3646 
/* Returns a device pointer to the CSR value array of A for read-only access,
   after making sure the latest host-side values have been uploaded to the GPU.
   Fails for ELL/HYB storage and when the cusparse structures are missing. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring the device copy up to date before handing out its value array */
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
3665 
/* Companion to MatSeqAIJCUSPARSEGetArrayRead(): the access was read-only, so no
   state needs to change; we only invalidate the caller's borrowed pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL; /* guard against accidental reuse of the device pointer */
  PetscFunctionReturn(0);
}
3675 
/* Returns a mutable device pointer to the CSR value array of A for write access.
   NOTE(review): unlike the read variant, no host-to-device copy is performed here;
   the caller gets whatever value array is currently resident on the GPU and is
   expected to overwrite it -- confirm this write-only contract with the callers. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  matrix = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
3692 
/* Companion to MatSeqAIJCUSPARSEGetArrayWrite(): the device values may have been
   modified, so bump the object state and mark the GPU copy as the authoritative
   one before invalidating the caller's pointer. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); /* values changed on device */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  *a = NULL;
  PetscFunctionReturn(0);
}
3706 
/* Strict weak ordering on (row,col,value,flag) tuples: lexicographic by (row,col).
   The scalar value and the boolean flag do not participate in the comparison. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, bool> &t1, const thrust::tuple<int, int, PetscScalar, bool> &t2)
  {
    const int r1 = t1.get<0>();
    const int r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;          /* different rows: row decides */
    return t1.get<1>() < t2.get<1>();      /* same row: column decides */
  }
};
3717 
/* Unary functor adding a fixed integer offset to its argument; used below to
   relocate column indices / row offsets when concatenating two matrices. */
struct Shift
{
  int _shift; /* constant added to every input */

  Shift(int offset) : _shift(offset) {}
  __host__ __device__
  inline int operator() (const int &c)
  {
    return _shift + c;
  }
};
3729 
/* merges two SeqAIJCUSPARSE matrices side by side, the [A';B']' operation in MATLAB notation */
/*
  Builds C = [A B] (horizontal concatenation: same rows, A->cmap->n + B->cmap->n
  columns) entirely on the GPU for two MATSEQAIJCUSPARSE matrices.

  With MAT_INITIAL_MATRIX the routine creates C, merges the CSR structures of A
  and B (converted to COO row form) into C's CSR arrays, and stores in
  Ccusp->cooPerm the position in C of every entry of A (first Annz slots) and B
  (remaining slots).  With MAT_REUSE_MATRIX only the values of A and B are
  scattered into C through that stored permutation.  MAT_INPLACE_MATRIX and the
  ELL/HYB formats are not supported.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and hand-build its cusparse mult structure */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by the SpMV calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    /* cooPerm: position in C of each entry of A (first Annz) and of B (rest); used by the reuse path */
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);

    if (c->nz) {
      /* expand A's and B's row offsets to COO row indices so the two nonzero streams can be merged */
      THRUSTINTARRAY32 Acoo(Annz);
      THRUSTINTARRAY32 Bcoo(Bnnz);
      THRUSTINTARRAY32 *roff;
      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Acusp->rowoffsets_gpu;
      } else roff = Acsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              roff->data().get(),
                              Annz,
                              m,
                              Acoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Bcusp->rowoffsets_gpu;
      } else roff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              roff->data().get(),
                              Bnnz,
                              m,
                              Bcoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      /* merge the two streams of (row, shifted col, value, came-from-A) tuples into C,
         ordering by (row,col) via IJCompare4; B's columns are shifted by A->cmap->n */
      THRUSTINTARRAY32 Ccoo(c->nz);
      auto Aperm = thrust::make_constant_iterator(true);
      auto Bperm = thrust::make_constant_iterator(false);
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
      thrust::device_vector<bool> wPerm(Annz+Bnnz); /* records the origin (A=true/B=false) of each merged slot */
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      thrust::merge(Azb,Aze,Bzb,Bze,Czb,IJCompare4());
      /* split the merged positions by origin: [p1,p2) <- positions of A's entries, [p2,...) <- B's */
      thrust::partition_copy(thrust::make_counting_iterator(zero),thrust::make_counting_iterator(c->nz),wPerm.begin(),p1,p2,thrust::identity<bool>());
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo.data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C^T is A^T stacked on top of B^T: concatenate row offsets (B's shifted by a->nz,
           overwriting A^T's duplicated last offset), column indices, and values */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* step back one: A^T's last offset equals B^T's first shifted offset */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the merged structure back to the host CSR arrays of C */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: the structure and cooPerm already exist, only scatter new values */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* consistency checks: the stored structure must still match A and B */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C through the first half of cooPerm ... */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* ... and B's values through the second half */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        /* the transpose values are simply A^T's followed by B^T's (see the INITIAL path) */
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4017