xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 6a29ce69e29aaf78b7325cac1ae663c7f0b163da)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
69 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
70 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
71 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
75 
76 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
84 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85 
86 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
87 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
88 
/*
   MatCUSPARSESetStream - records the given CUDA stream in the matrix's
   cuSPARSE context and attaches it to the cuSPARSE handle, so subsequent
   cuSPARSE calls for this matrix are queued on that stream.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
100 
/*
   MatCUSPARSESetHandle - adopts an externally created cuSPARSE handle for
   this matrix, destroying any handle previously stored, and sets the
   handle's pointer mode to device pointers.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we previously owned before adopting the new one */
    if (cusp->handle) {cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);}
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
117 
/*
   MatCUSPARSEClearHandle - forgets the cuSPARSE handle stored in the matrix
   without destroying it (the handle is assumed to be owned elsewhere).
   A no-op for matrices that are not MATSEQAIJCUSPARSE or lack a GPU context.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusp;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (iscusp && cusp) {
    /* drop the reference only; no cusparseDestroy() here */
    if (cusp->handle) cusp->handle = 0;
  }
  PetscFunctionReturn(0);
}
130 
/* Reports MATSOLVERCUSPARSE as the solver package behind this factored matrix. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
137 
138 /*MC
139   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
140   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
141 algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
142   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
143   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
144   algorithms are not recommended. This class does NOT support direct solver operations.
145 
146   Level: beginner
147 
148 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
149 M*/
150 
/*
   Creates the factor matrix B for a MATSEQAIJCUSPARSE matrix A and wires up
   the symbolic-factorization function pointers appropriate for the requested
   factor type (LU/ILU/ILUDT or Cholesky/ICC).  Other factor types error out.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* storage is allocated later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
176 
/*
   Type-specific implementation of MatCUSPARSESetFormat(): stores the
   requested GPU storage format in the matrix's cuSPARSE context.  Both
   supported operations set the same field, so the cases share one body.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* guard against a missing GPU context, as MatCUSPARSESetStream()/SetHandle() do,
     instead of dereferencing a NULL spptr below */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
194 
195 /*@
196    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
197    operation. Only the MatMult operation can use different GPU storage formats
198    for MPIAIJCUSPARSE matrices.
199    Not Collective
200 
201    Input Parameters:
202 +  A - Matrix of type SEQAIJCUSPARSE
203 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
204 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
205 
206    Output Parameter:
207 
208    Level: intermediate
209 
210 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
211 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific implementation, if the type provides one */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
221 
222 /*@
223    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
224 
225    Collective on mat
226 
227    Input Parameters:
228 +  A - Matrix of type SEQAIJCUSPARSE
229 -  transgen - the boolean flag
230 
231    Level: intermediate
232 
233 .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
234 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (iscusp) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    /* when turning the option off, drop any cached transpose so a later
       re-enable cannot pick up a stale structure */
    if (!transgen) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
254 
/*
   Processes -mat_cusparse_* runtime options for a MATSEQAIJCUSPARSE matrix:
   explicit-transpose generation, storage format for SpMV and/or TriSolve,
   and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
   Options apply only to unfactored matrices (MAT_FACTOR_NONE).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* -mat_cusparse_mult_storage_format affects SpMV only ... */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* ... while -mat_cusparse_storage_format applies to all operations, so it
       is parsed second and overrides the former when both are given */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency check as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
298 
/* Symbolic ILU: runs the CPU symbolic phase, after clearing stale GPU factors. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* discard GPU triangular-factor data from any previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* the GPU-aware numeric phase repopulates the factors */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
310 
/* Symbolic LU: runs the CPU symbolic phase, after clearing stale GPU factors. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* discard GPU triangular-factor data from any previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* the GPU-aware numeric phase repopulates the factors */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
322 
/* Symbolic ICC: runs the CPU symbolic phase, after clearing stale GPU factors. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* discard GPU triangular-factor data from any previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* the GPU-aware numeric phase repopulates the factors */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
334 
/* Symbolic Cholesky: runs the CPU symbolic phase, after clearing stale GPU factors. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* discard GPU triangular-factor data from any previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* the GPU-aware numeric phase repopulates the factors */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
346 
/*
  Builds (first call) or refreshes (later calls) the GPU copy of the
  unit-diagonal lower-triangular factor used by the cuSPARSE triangular
  solves.  The factor is assembled in pinned host memory from the host AIJ
  arrays of A and then uploaded via thrust assign().
  NOTE(review): assumes each host row i>0 of (ai,aj,aa) holds the strictly
  lower entries of that row of the factor, with the unit diagonal implicit --
  i.e. the SeqAIJ ILU factor layout; confirm against MatILUFactorSymbolic_SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;    /* pinned host CSR row offsets / column indices */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* rebuild only when the up-to-date data lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned (page-locked) host buffers for faster host->device transfer */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;    /* row 0 holds only the unit diagonal */
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          /* copy the strictly lower entries of row i, then append the unit diagonal */
          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* upload the host CSR arrays to the device (thrust assign copies H2D) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API: query and allocate the analysis/solve work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AA_h is kept for the fast values-only update path below; Ai/Aj are not needed again */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;   /* unit diagonal */
          offset      += 1;
          v  += nz;
        }
        /* the CSR sparsity pattern is unchanged; only the values are re-uploaded */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
491 
/*
  Builds (first call) or refreshes (later calls) the GPU copy of the
  upper-triangular factor used by the cuSPARSE triangular solves.  Rows are
  assembled backwards (i = n-1 .. 0) using the a->diag pointers, storing the
  reciprocal of the diagonal entry so the factor has a non-unit diagonal.
  NOTE(review): assumes the SeqAIJ factor layout where adiag[i] points at the
  diagonal of row i and adiag[i+1]+1 .. adiag[i]-1 hold the strictly upper
  entries -- confirm against the SeqAIJ numeric factorization.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;    /* pinned host CSR row offsets / column indices */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* rebuild only when the up-to-date data lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned (page-locked) host buffers for faster host->device transfer */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];   /* store reciprocal of the diagonal */
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* upload the host CSR arrays to the device (thrust assign copies H2D) */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API: query and allocate the analysis/solve work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* AA_h is kept for the fast values-only update path below; Ai/Aj are not needed again */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else {    /* update values only; the sparsity pattern is unchanged */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];   /* reciprocal of the diagonal, as in the build path */
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
635 
/*
  Stages the ILU triangular factors of A on the GPU and performs the solve
  analysis, then caches the row/column permutations device-side (only when
  they are not the identity, and only on the first call).  Sets
  A->offloadmask to PETSC_OFFLOAD_BOTH on success.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm,colperm;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscInt                     nrows = A->rmap->n;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  rowperm = aij->row;
  colperm = aij->icol;

  /* build both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the lower and upper triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the GPU unless it is the identity */
  ierr = ISIdentity(rowperm,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *indices;

    ierr = ISGetIndices(rowperm,&indices);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(indices,indices+nrows);
    ierr = ISRestoreIndices(rowperm,&indices);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* cache the column permutation on the GPU unless it is the identity */
  ierr = ISIdentity(colperm,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *indices;

    ierr = ISGetIndices(colperm,&indices);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(indices,indices+nrows);
    ierr = ISRestoreIndices(colperm,&indices);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
679 
/*
  Builds the GPU triangular factors used for ICC solves.  The factored matrix A
  holds upper-triangular rows with the diagonal entry stored last in each row,
  and the stored diagonal value is its reciprocal (the code computes 1.0/v[nz]).
  Only the upper factor's sparsity pattern is stored on the GPU; the lower
  factor reuses that pattern with rescaled values and is applied via a
  transpose solve (solveOp = CUSPARSE_OPERATION_TRANSPOSE).
  On the first call both factor structures and values are created; on
  subsequent calls (refactorization) only the values are refilled and copied.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is reinterpreted as Mat_SeqSBAIJ to read i/j/a;
     presumably the ICC factor shares the SBAIJ field layout -- confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper and (scaled) lower values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            /* negate off-diagonals; lower factor additionally divides by the
               (stored reciprocal) diagonal */
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* the lower factor reuses the UPPER storage and is solved with
           OPERATION_TRANSPOSE below, so fill mode stays UPPER on purpose */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* refactorization: structure already on the GPU, refill values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
895 
/*
  Builds the ICC triangular factors on the GPU and caches the row/column
  permutations device-side when the ordering is not the identity.  Sets
  A->offloadmask to PETSC_OFFLOAD_BOTH.  Called from
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE(), possibly repeatedly when the
  matrix is refactorized.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* both factors share the off-diagonal entries; diagonal counted once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload only once -- the guard on rpermIndices
     prevents re-allocating (and leaking) the device arrays when the numeric
     factorization is repeated, mirroring MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity && !cusparseTriFactors->rpermIndices) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
933 
/*
  Numeric LU factorization for SeqAIJCUSPARSE: the factorization itself runs on
  the CPU (after pulling A back from the GPU), then the triangular factors are
  copied to the GPU and analyzed so subsequent solves run device-side.  The
  natural-ordering fast path is selected when both row and column permutations
  are the identity.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      row_identity,col_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the CPU kernel needs current values of A */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used */
  ierr = ISIdentity(b->row,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(b->col,&col_identity);CHKERRQ(ierr);
  if (row_identity && col_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
964 
/*
  Numeric Cholesky (ICC) factorization for SeqAIJCUSPARSE: factorize on the
  CPU after syncing A from the GPU, then upload and analyze the triangular
  factors so solves run on the device.  The natural-ordering fast path is
  chosen when the permutation is the identity.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
  PetscBool      perm_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the CPU kernel needs current values of A */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used */
  ierr = ISIdentity(b->row,&perm_identity);CHKERRQ(ierr);
  B->ops->solve          = perm_identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                         : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = perm_identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                         : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
994 
/*
  Builds the transposes (CSC forms) of the lower and upper triangular factors
  already present in A->spptr, so MatSolveTranspose can run entirely on the
  GPU.  Each transpose inherits the source descriptor with the fill mode
  flipped, is produced with cusparse csr2csc, and then gets its own solve
  analysis.  Results are stored in loTriFactorPtrTranspose and
  upTriFactorPtrTranspose.

  Fix: the MAT_CUSPARSEGenerateTranspose log event was begun twice and never
  ended (PetscLogEventBegin was called where PetscLogEventEnd belonged, in
  both the lower- and upper-factor sections), which corrupts -log_view event
  nesting; both call sites now use PetscLogEventEnd.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor: copy all
     settings but flip the fill mode, since the transpose of an upper factor
     is lower and vice versa */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: the event must be closed here */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* was PetscLogEventBegin: the event must be closed here */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1196 
/*
   MatSeqAIJCUSPARSEGenerateTransposeForMult - builds and caches an explicit transpose
   of the device matrix in cusparsestruct->matTranspose, so transpose products can be
   performed as a plain (non-transposed) multiply against the cached transpose.

   Does nothing when transpose generation is disabled (cusparsestruct->transgen is
   false), when the transpose has already been built, or when the matrix has zero rows
   or columns.
*/
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalar constants, matching the
     CUSPARSE_POINTER_MODE_DEVICE set below */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* the transpose has A's dimensions swapped */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* 32-bit device copy of the host row offsets a->i, needed as csr2csc input */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* since CUDA 11, csr2csc requires a caller-allocated work buffer */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose). The transpose has A's
       dimensions swapped: A->cmap->n rows and A->rmap->n columns. In particular,
       csr2csc writes num_cols+1 = A->cmap->n+1 row offsets into tempT->row_offsets,
       so it must be sized by A->cmap->n (sizing it by A->rmap->n, as the CSR branch's
       sibling code makes clear, would overflow for rectangular matrices with more
       columns than rows) */
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB; pass the dimensions of the transpose, not of A */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1355 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   MatSolveTranspose_SeqAIJCUSPARSE - GPU triangular solve of A^T x = b for a factored
   matrix that carries row/column permutations.

   Uses the explicitly transposed triangular factors, which are generated lazily on
   the first transpose solve. The upper factor's transpose is solved before the lower
   one (reverse of the order used in MatSolve_SeqAIJCUSPARSE below). The work vector
   cusparseTriFactors->workVector is used as the intermediate between the two solves
   and for the final out-of-place permutation.

   Input Parameter:
.  bb - right-hand-side vector (read only)

   Output Parameter:
.  xx - solution vector
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; refresh the local
     pointers afterwards since the analysis fills them in */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b[rperm[i]] into x, which serves
     as input to the first solve.
     NOTE(review): the row permutation is applied first and the column permutation
     last, the same order as the non-transpose MatSolve_SeqAIJCUSPARSE -- confirm
     this is the intended ordering for the transposed system. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: reads from xarray, writes the intermediate into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: reads the intermediate from tempGPU, writes back into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for the two triangular solves over the factors' nonzeros */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1442 
/*
   MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - GPU triangular solve of A^T x = b
   for a factorization in natural ordering (no row/column permutation vectors), so the
   right-hand side is fed directly into the first solve.

   Solves U^T first (b -> work vector), then L^T (work vector -> x). The transposed
   factors are generated lazily on the first transpose solve.

   Input Parameter:
.  bb - right-hand-side vector (read only)

   Output Parameter:
.  xx - solution vector
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; refresh the local
     pointers afterwards since the analysis fills them in */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U: reads b directly, writes the intermediate into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: reads the intermediate from tempGPU, writes the solution into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for the two triangular solves over the factors' nonzeros */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1510 
/*
   MatSolve_SeqAIJCUSPARSE - GPU triangular solve of A x = b for a factored matrix
   that carries row/column permutations (A = L*U modulo the permutations).

   Flow: gather b through the row permutation into the work vector, solve L
   (work -> x), solve U (x -> work), then scatter the work vector through the
   column permutation into x.

   Input Parameter:
.  bb - right-hand-side vector (read only)

   Output Parameter:
.  xx - solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: gather b[rperm[i]] into the work vector */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: reads from tempGPU, writes the intermediate into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: reads from xarray, writes the unpermuted solution into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for the two triangular solves over the factors' nonzeros */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1585 
/*
   MatSolve_SeqAIJCUSPARSE_NaturalOrdering - GPU triangular solve of A x = b for a
   factorization in natural ordering (no row/column permutation vectors).

   Solves L first (b -> work vector), then U (work vector -> x).

   Input Parameter:
.  bb - right-hand-side vector (read only)

   Output Parameter:
.  xx - solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: reads b directly, writes the intermediate into tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: reads the intermediate from tempGPU, writes the solution into xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flop count for the two triangular solves over the factors' nonzeros */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1645 
/*
   MatSeqAIJCUSPARSECopyFromGPU - bring the CSR numerical values back from the device
   when only the GPU copy is current.

   Only the values array is transferred (the sparsity pattern on the host is left
   untouched); afterwards the offload mask is PETSC_OFFLOAD_BOTH, i.e. host and
   device agree. A no-op for any other offload state.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  csr  = (CsrMatrix*)cusp->mat->mat;
  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1666 
/*
   MatSeqAIJGetArray_SeqAIJCUSPARSE - hand back the host value array, first syncing
   the values from the GPU if the device copy is ahead.

   The offload mask is set to PETSC_OFFLOAD_CPU afterwards, i.e. the host copy is
   treated as the authoritative one once the caller holds the array.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1678 
1679 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1680 {
1681   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1682   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1683   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1684   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1685   PetscErrorCode               ierr;
1686   cusparseStatus_t             stat;
1687   cudaError_t                  err;
1688 
1689   PetscFunctionBegin;
1690   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1691   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1692     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1693       /* Copy values only */
1694       CsrMatrix *matrix,*matrixT;
1695       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1696 
1697       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1698       matrix->values->assign(a->a, a->a+a->nz);
1699       err  = WaitForCUDA();CHKERRCUDA(err);
1700       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1701       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1702 
1703       /* Update matT when it was built before */
1704       if (cusparsestruct->matTranspose) {
1705         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1706         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1707         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1708         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1709                             A->cmap->n, matrix->num_entries,
1710                             matrix->values->data().get(),
1711                             cusparsestruct->rowoffsets_gpu->data().get(),
1712                             matrix->column_indices->data().get(),
1713                             matrixT->values->data().get(),
1714                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1715                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1716                             CUSPARSE_ACTION_NUMERIC,indexBase,
1717                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1718                           #else
1719                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1720                             CUSPARSE_ACTION_NUMERIC, indexBase
1721                           #endif
1722 );CHKERRCUSPARSE(stat);
1723         err  = WaitForCUDA();CHKERRCUDA(err);
1724         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1725       }
1726     } else {
1727       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1728       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1729       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1730       delete cusparsestruct->workVector;
1731       delete cusparsestruct->rowoffsets_gpu;
1732       try {
1733         if (a->compressedrow.use) {
1734           m    = a->compressedrow.nrows;
1735           ii   = a->compressedrow.i;
1736           ridx = a->compressedrow.rindex;
1737         } else {
1738           m    = A->rmap->n;
1739           ii   = a->i;
1740           ridx = NULL;
1741         }
1742         cusparsestruct->nrows = m;
1743 
1744         /* create cusparse matrix */
1745         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1746         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1747         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1748         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1749 
1750         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1751         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1752         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1753         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1754         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1755         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1756         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1757 
1758         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1759         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1760           /* set the matrix */
1761           CsrMatrix *mat= new CsrMatrix;
1762           mat->num_rows = m;
1763           mat->num_cols = A->cmap->n;
1764           mat->num_entries = a->nz;
1765           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1766           mat->row_offsets->assign(ii, ii + m+1);
1767 
1768           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1769           mat->column_indices->assign(a->j, a->j+a->nz);
1770 
1771           mat->values = new THRUSTARRAY(a->nz);
1772           mat->values->assign(a->a, a->a+a->nz);
1773 
1774           /* assign the pointer */
1775           matstruct->mat = mat;
1776          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1777           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1778             stat = cusparseCreateCsr(&matstruct->matDescr,
1779                                     mat->num_rows, mat->num_cols, mat->num_entries,
1780                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1781                                     mat->values->data().get(),
1782                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1783                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1784           }
1785          #endif
1786         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1787          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1788           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1789          #else
1790           CsrMatrix *mat= new CsrMatrix;
1791           mat->num_rows = m;
1792           mat->num_cols = A->cmap->n;
1793           mat->num_entries = a->nz;
1794           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1795           mat->row_offsets->assign(ii, ii + m+1);
1796 
1797           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1798           mat->column_indices->assign(a->j, a->j+a->nz);
1799 
1800           mat->values = new THRUSTARRAY(a->nz);
1801           mat->values->assign(a->a, a->a+a->nz);
1802 
1803           cusparseHybMat_t hybMat;
1804           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1805           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1806             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1807           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1808               matstruct->descr, mat->values->data().get(),
1809               mat->row_offsets->data().get(),
1810               mat->column_indices->data().get(),
1811               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1812           /* assign the pointer */
1813           matstruct->mat = hybMat;
1814 
1815           if (mat) {
1816             if (mat->values) delete (THRUSTARRAY*)mat->values;
1817             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1818             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1819             delete (CsrMatrix*)mat;
1820           }
1821          #endif
1822         }
1823 
1824         /* assign the compressed row indices */
1825         if (a->compressedrow.use) {
1826           cusparsestruct->workVector = new THRUSTARRAY(m);
1827           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1828           matstruct->cprowIndices->assign(ridx,ridx+m);
1829           tmp = m;
1830         } else {
1831           cusparsestruct->workVector = NULL;
1832           matstruct->cprowIndices    = NULL;
1833           tmp = 0;
1834         }
1835         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1836 
1837         /* assign the pointer */
1838         cusparsestruct->mat = matstruct;
1839       } catch(char *ex) {
1840         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1841       }
1842       err  = WaitForCUDA();CHKERRCUDA(err);
1843       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1844       cusparsestruct->nonzerostate = A->nonzerostate;
1845     }
1846     A->offloadmask = PETSC_OFFLOAD_BOTH;
1847   }
1848   PetscFunctionReturn(0);
1849 }
1850 
/* Thrust functor: accumulate element 0 of the tuple into element 1, i.e. y += x.
   Used with thrust::for_each over a zipped (source, destination) iterator pair. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
1860 
/* Thrust functor: copy element 0 of the tuple into element 1, i.e. y = x. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
1870 
/* Thrust functor: copy element 1 of the tuple into element 0, i.e. x = y
   (the assignment direction is reversed with respect to VecCUDAEquals). */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<0>(t) = thrust::get<1>(t); }
};
1880 
/* Scratch data attached to C->product for cuSPARSE-based mat-mat products;
   freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;  /* caller passed C as MATSEQDENSE (CPU); result is converted back after the GPU product */
  PetscScalar           *Bt;       /* device buffer for B^T (CUDA < 11 path, where csrmm cannot transpose B) */
  Mat                   X;         /* intermediate dense matrix for PtAP/RARt products */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count of the product, logged during the numeric phase */
  CsrMatrix             *Bcsr;    /* uncompressed copy of B when B is stored in compressed-row format */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;  /* descriptor for Bcsr (only when B is compressed) */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor for C (or for X with PtAP/RARt) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* size of mmBuffer in bytes */
  void                  *mmBuffer;    /* workspace for cusparseSpMM / cusparseSpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* descriptor reused between SpGEMM symbolic and numeric phases */
#endif
};
1900 
/* Destroy callback for the MatMatCusparse scratch data attached to C->product:
   releases device buffers and cuSPARSE descriptors, then the host struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so Bt may legitimately be unset */
  delete mmdata->Bcsr;                          /* deleting a NULL pointer is also a no-op */
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); /* intermediate dense matrix (PtAP/RARt); may be NULL */
  ierr = PetscFree(data);CHKERRQ(ierr);        /* free the container last */
  PetscFunctionReturn(0);
}
1925 
1926 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1927 
/* Numeric phase of sparse (SeqAIJCUSPARSE) x dense (SeqDENSECUDA) products.
   Supports AB, AtB, ABt directly via cusparseSpMM (CUDA >= 11) or cusparse_csr_spmm
   (CUDA < 11); PtAP and RARt are computed as X = A*B (resp. X = A*Bt) followed by a
   dense-dense product handled by MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select which stored matrix (A or A^T) and which cusparse operation to use,
     and the dimensions m x n of the sparse-dense product op(A)*op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP first computes X = A*B here; Pt*X is done below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      /* no explicit transpose stored: let cusparse transpose A on the fly */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the explicitly generated transpose, which is faster for repeated products */
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt first computes X = A*Rt here; R*X is done below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP/RARt the sparse-dense product targets the intermediate matrix X, not C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }
    /* NOTE(review): if only Clda changed, an existing matBDescr keeps its previous barray
       pointer on this path — presumably barray is unchanged in that case; verify */

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the workspace only when the required size exceeds what is already allocated */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr; /* intentionally shadows the outer cudaError_t cerr */

    /* explicitly transpose B into the Bt buffer allocated in the symbolic phase */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with the dense-dense product against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* convert back to CPU types if the caller passed CPU matrices */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2101 
/* Symbolic phase of sparse (SeqAIJCUSPARSE) x dense (SeqDENSECUDA) products:
   sets the sizes and type of C, allocates the MatMatCusparse scratch data
   (including the B^T buffer for CUDA < 11 and the intermediate X for PtAP/RARt),
   and installs the numeric-phase callback. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* determine the dimensions of the result C for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense; /* remembered so the numeric phase converts C back to CPU */
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2175 
/* Numeric phase of sparse-sparse (SeqAIJCUSPARSE x SeqAIJCUSPARSE) products.
   Recomputes the values of C using the descriptors/workspace prepared by the
   symbolic phase: cusparseSpGEMM_compute/copy on CUDA >= 11, cusparse_csr_spgemm
   otherwise. When mmdata->reusesym is set, the values were already computed during
   the symbolic phase and only the assembly bookkeeping is performed. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just finish assembly */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let us replace a transposed product with a plain AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* compute the values, then copy them into the CSR arrays already allocated for C */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); /* flop count recorded during the symbolic phase */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* fresh values live on the GPU only */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2290 
2291 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2292 {
2293   Mat_Product                  *product = C->product;
2294   Mat                          A,B;
2295   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2296   Mat_SeqAIJ                   *a,*b,*c;
2297   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2298   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2299   PetscInt                     i,j,m,n,k;
2300   PetscBool                    flg;
2301   PetscErrorCode               ierr;
2302   cusparseStatus_t             stat;
2303   cudaError_t                  cerr;
2304   MatProductType               ptype;
2305   MatMatCusparse               *mmdata;
2306   PetscLogDouble               flops;
2307   PetscBool                    biscompressed,ciscompressed;
2308 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2309   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2310   size_t                       bufSize2;
2311   cusparseSpMatDescr_t         BmatSpDescr;
2312 #else
2313   int                          cnz;
2314 #endif
2315 
2316   PetscFunctionBegin;
2317   MatCheckProduct(C,1);
2318   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2319   A    = product->A;
2320   B    = product->B;
2321   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2322   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2323   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2324   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2325   a = (Mat_SeqAIJ*)A->data;
2326   b = (Mat_SeqAIJ*)B->data;
2327   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2328   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2329   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2330   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2331 
2332   /* product data */
2333   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2334   C->product->data    = mmdata;
2335   C->product->destroy = MatDestroy_MatMatCusparse;
2336 
2337   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2338   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2339   ptype = product->type;
2340   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2341   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2342   biscompressed = PETSC_FALSE;
2343   ciscompressed = PETSC_FALSE;
2344   switch (ptype) {
2345   case MATPRODUCT_AB:
2346     m = A->rmap->n;
2347     n = B->cmap->n;
2348     k = A->cmap->n;
2349     Amat = Acusp->mat;
2350     Bmat = Bcusp->mat;
2351     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2352     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2353     break;
2354   case MATPRODUCT_AtB:
2355     m = A->cmap->n;
2356     n = B->cmap->n;
2357     k = A->rmap->n;
2358     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2359     Amat = Acusp->matTranspose;
2360     Bmat = Bcusp->mat;
2361     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2362     break;
2363   case MATPRODUCT_ABt:
2364     m = A->rmap->n;
2365     n = B->rmap->n;
2366     k = A->cmap->n;
2367     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2368     Amat = Acusp->mat;
2369     Bmat = Bcusp->matTranspose;
2370     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2371     break;
2372   default:
2373     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2374   }
2375 
2376   /* create cusparse matrix */
2377   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2378   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2379   c     = (Mat_SeqAIJ*)C->data;
2380   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2381   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2382   Ccsr  = new CsrMatrix;
2383 
2384   c->compressedrow.use = ciscompressed;
2385   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2386     c->compressedrow.nrows = a->compressedrow.nrows;
2387     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2388     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2389     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2390     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2391     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2392   } else {
2393     c->compressedrow.nrows  = 0;
2394     c->compressedrow.i      = NULL;
2395     c->compressedrow.rindex = NULL;
2396     Ccusp->workVector       = NULL;
2397     Cmat->cprowIndices      = NULL;
2398   }
2399   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2400   Ccusp->mat      = Cmat;
2401   Ccusp->mat->mat = Ccsr;
2402   Ccsr->num_rows    = Ccusp->nrows;
2403   Ccsr->num_cols    = n;
2404   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2405   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2406   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2407   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2408   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2409   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2410   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2411   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2412   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2413   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2414   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2415     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2416     c->nz = 0;
2417     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2418     Ccsr->values = new THRUSTARRAY(c->nz);
2419     goto finalizesym;
2420   }
2421 
2422   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2423   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2424   Acsr = (CsrMatrix*)Amat->mat;
2425   if (!biscompressed) {
2426     Bcsr = (CsrMatrix*)Bmat->mat;
2427 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2428     BmatSpDescr = Bmat->matDescr;
2429 #endif
2430   } else { /* we need to use row offsets for the full matrix */
2431     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2432     Bcsr = new CsrMatrix;
2433     Bcsr->num_rows       = B->rmap->n;
2434     Bcsr->num_cols       = cBcsr->num_cols;
2435     Bcsr->num_entries    = cBcsr->num_entries;
2436     Bcsr->column_indices = cBcsr->column_indices;
2437     Bcsr->values         = cBcsr->values;
2438     if (!Bcusp->rowoffsets_gpu) {
2439       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2440       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2441       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2442     }
2443     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2444     mmdata->Bcsr = Bcsr;
2445 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2446     if (Bcsr->num_rows && Bcsr->num_cols) {
2447       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2448                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2449                                Bcsr->values->data().get(),
2450                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2451                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2452     }
2453     BmatSpDescr = mmdata->matSpBDescr;
2454 #endif
2455   }
2456   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2457   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2458   /* precompute flops count */
2459   if (ptype == MATPRODUCT_AB) {
2460     for (i=0, flops = 0; i<A->rmap->n; i++) {
2461       const PetscInt st = a->i[i];
2462       const PetscInt en = a->i[i+1];
2463       for (j=st; j<en; j++) {
2464         const PetscInt brow = a->j[j];
2465         flops += 2.*(b->i[brow+1] - b->i[brow]);
2466       }
2467     }
2468   } else if (ptype == MATPRODUCT_AtB) {
2469     for (i=0, flops = 0; i<A->rmap->n; i++) {
2470       const PetscInt anzi = a->i[i+1] - a->i[i];
2471       const PetscInt bnzi = b->i[i+1] - b->i[i];
2472       flops += (2.*anzi)*bnzi;
2473     }
2474   } else { /* TODO */
2475     flops = 0.;
2476   }
2477 
2478   mmdata->flops = flops;
2479   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2480 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2481   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2482   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2483                            NULL, NULL, NULL,
2484                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2485                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2486   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2487   /* ask bufferSize bytes for external memory */
2488   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2489                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2490                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2491                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2492   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUSPARSE(stat);
2493   /* inspect the matrices A and B to understand the memory requirement for the next step */
2494   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2495                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2496                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2497                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2498   /* ask bufferSize again bytes for external memory */
2499   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2500                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2501                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2502                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2503   /* The CUSPARSE documentation is not clear, nor the API
2504      We need both buffers to perform the operations properly!
2505      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2506      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2507      is stored in the descriptor! What a messy API... */
2508   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUSPARSE(stat);
2509   /* compute the intermediate product of A * B */
2510   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2511                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2512                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2513                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2514   /* get matrix C non-zero entries C_nnz1 */
2515   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2516   c->nz = (PetscInt) C_nnz1;
2517   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2518   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2519   Ccsr->values = new THRUSTARRAY(c->nz);
2520   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2521   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2522                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2523   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2524                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2525                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2526 #else
2527   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2528   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2529                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2530                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2531                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2532                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2533   c->nz = cnz;
2534   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2535   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2536   Ccsr->values = new THRUSTARRAY(c->nz);
2537   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2538 
2539   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2540   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2541      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2542      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2543   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2545                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2546                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2547                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2548 #endif
2549   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2550   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2551   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2552 finalizesym:
2553   c->singlemalloc = PETSC_FALSE;
2554   c->free_a       = PETSC_TRUE;
2555   c->free_ij      = PETSC_TRUE;
2556   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2557   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2558   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2559     PetscInt *d_i = c->i;
2560     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2561     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2562     ii   = *Ccsr->row_offsets;
2563     jj   = *Ccsr->column_indices;
2564     if (ciscompressed) d_i = c->compressedrow.i;
2565     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2566     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2567   } else {
2568     PetscInt *d_i = c->i;
2569     if (ciscompressed) d_i = c->compressedrow.i;
2570     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2571     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2572   }
2573   if (ciscompressed) { /* need to expand host row offsets */
2574     PetscInt r = 0;
2575     c->i[0] = 0;
2576     for (k = 0; k < c->compressedrow.nrows; k++) {
2577       const PetscInt next = c->compressedrow.rindex[k];
2578       const PetscInt old = c->compressedrow.i[k];
2579       for (; r < next; r++) c->i[r+1] = old;
2580     }
2581     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2582   }
2583   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2584   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2585   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2586   c->maxnz = c->nz;
2587   c->nonzerorowcnt = 0;
2588   c->rmax = 0;
2589   for (k = 0; k < m; k++) {
2590     const PetscInt nn = c->i[k+1] - c->i[k];
2591     c->ilen[k] = c->imax[k] = nn;
2592     c->nonzerorowcnt += (PetscInt)!!nn;
2593     c->rmax = PetscMax(c->rmax,nn);
2594   }
2595   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2596   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2597   Ccsr->num_entries = c->nz;
2598 
2599   C->nonzerostate++;
2600   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2601   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2602   Ccusp->nonzerostate = C->nonzerostate;
2603   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2604   C->preallocated  = PETSC_TRUE;
2605   C->assembled     = PETSC_FALSE;
2606   C->was_assembled = PETSC_FALSE;
2607   if (product->api_user) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2608     mmdata->reusesym = PETSC_TRUE;
2609     C->offloadmask   = PETSC_OFFLOAD_GPU;
2610   }
2611   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2612   PetscFunctionReturn(0);
2613 }
2614 
2615 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2616 
/* Dispatch hook selecting the symbolic product implementation for a SEQAIJCUSPARSE A.
   B (and, for ABC, C) may be a cusparse sparse matrix or a SEQDENSE matrix; anything
   else (or matrices bound to the CPU) falls back to the plain SeqAIJ machinery. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* classify B: dense, or cusparse (only when it is not bound to the CPU) */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  if (!product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  /* for triple products also classify C; default Ciscusp = PETSC_TRUE covers the non-ABC case */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    if (product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (product->type == MATPRODUCT_AB  || product->type == MATPRODUCT_AtB ||
               product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_PtAP ||
               product->type == MATPRODUCT_RARt) {
      /* a CPU-bound A forces the host sparse-dense path */
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } /* other product types: leave productsymbolic unset, as before */
  } else if (Biscusp && Ciscusp) {
    if (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } /* other product types: leave productsymbolic unset, as before */
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2675 
/* yy = A*xx */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* no additive vector, no transpose, no Hermitian */
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2684 
/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* additive vector yy, no transpose, no Hermitian */
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2693 
/* yy = A^H*xx */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* no additive vector; transpose and Hermitian both requested */
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2702 
/* zz = A^H*xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* additive vector yy; transpose and Hermitian both requested */
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2711 
/* yy = A^T*xx */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* no additive vector; plain (non-Hermitian) transpose */
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2720 
/* Core mat-vec driver: zz = op(A) xx + yy, where
     !trans          -> op(A) = A
     trans && !herm  -> op(A) = A^T
     trans && herm   -> op(A) = A^H
   yy may be NULL (then zz = op(A) xx), and yy == zz is allowed (in-place add).
   Handles compressed-row storage (zero rows dropped) via a work vector plus a
   scatter/gather against cprowIndices. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* lengths of x and y for the SpMV descriptors; set only in the CSR branches below */
#endif

  PetscFunctionBegin;
  /* herm without trans would mean conj(A), which has no cusparseOperation_t equivalent here */
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* no nonzero rows at all: op(A) xx == 0, so the result is just yy (or 0) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Either let cusparse apply ^T/^H to the stored matrix, or (when transgen is set and
       the operation is a plain transpose) multiply with an explicitly generated transpose. */
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        /* gather x[cprowIndices[k]] into workVector[k] */
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* for op = ^T/^H the x/y extents are swapped relative to the stored matrix */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation cache cuSpMV[], so guard against enum changes */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* zz[cprowIndices[k]] += workVector[k] */
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose case: dptr already pointed at zz, so only the explicit yy add (if distinct) remains */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    /* NOTE(review): only char* exceptions are caught; thrust typically throws std::exception
       subclasses, which would propagate past this handler — confirm this is intended */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; with no y we save one add per nonzero row */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2904 
/* zz = A^T*xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* additive vector yy; plain (non-Hermitian) transpose */
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2913 
/* Finish assembly; if a split-CSR device matrix exists, mark the data as living on the GPU. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *devmat = NULL;

  PetscFunctionBegin;
  /* grab the device matrix (only meaningful for unfactored matrices) before host assembly runs */
  if (A->factortype == MAT_FACTOR_NONE) devmat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* does very little if assembled on GPU - call it? */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  /* a device matrix implies the up-to-date values are on the GPU */
  if (devmat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2930 
2931 /* --------------------------------------------------------------------------------*/
2932 /*@
2933    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2934    (the default parallel PETSc format). This matrix will ultimately pushed down
2935    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2936    assembly performance the user should preallocate the matrix storage by setting
2937    the parameter nz (or the array nnz).  By setting these parameters accurately,
2938    performance during matrix assembly can be increased by more than a factor of 50.
2939 
2940    Collective
2941 
2942    Input Parameters:
2943 +  comm - MPI communicator, set to PETSC_COMM_SELF
2944 .  m - number of rows
2945 .  n - number of columns
2946 .  nz - number of nonzeros per row (same for all rows)
2947 -  nnz - array containing the number of nonzeros in the various rows
2948          (possibly different for each row) or NULL
2949 
2950    Output Parameter:
2951 .  A - the matrix
2952 
2953    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2955    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2956 
2957    Notes:
2958    If nnz is given then nz is ignored
2959 
2960    The AIJ format (also called the Yale sparse matrix format or
2961    compressed row storage), is fully compatible with standard Fortran 77
2962    storage.  That is, the stored row and column indices can begin at
2963    either one (as in Fortran) or zero.  See the users' manual for details.
2964 
2965    Specify the preallocated storage with either nz or nnz (not both).
2966    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2967    allocation.  For large problems you MUST preallocate memory or you
2968    will get TERRIBLE performance, see the users' manual chapter on matrices.
2969 
2970    By default, this format uses inodes (identical nodes) when possible, to
2971    improve numerical efficiency of matrix-vector products and solves. We
2972    search for consecutive rows with the same nonzero structure, thereby
2973    reusing matrix information to achieve increased efficiency.
2974 
2975    Level: intermediate
2976 
2977 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2978 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* create an empty matrix, give it the CUSPARSE type, then preallocate from nz/nnz */
  err = MatCreate(comm,A);CHKERRQ(err);
  err = MatSetSizes(*A,m,n,m,n);CHKERRQ(err);                       /* sequential: local == global sizes */
  err = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(err);
  err = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2990 
/* Destroy an AIJCUSPARSE matrix: free the GPU-side structures (or trifactor data for a
   factored matrix), free the fused device matrix if one was created, remove composed
   methods, and finally run the host SeqAIJ destructor. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;  /* device-resident split-CSR descriptor, if any */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device matrix pointer before the spptr holding it is destroyed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    /* factored matrices keep triangular-factor data in spptr instead */
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the descriptor back to the host so we can free the device arrays it points to */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i was separately allocated on the device for the compressed-row case */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* detach every method composed in MatConvert/MatBindToCPU before the base destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3025 
3026 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3027 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate by building a host SeqAIJ copy first, then converting it in place to CUSPARSE. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  err = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(err);
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3037 
/* Y += a*X for two AIJCUSPARSE matrices. When both operands share the same nonzero
   pattern and both have device CSR data, the update runs as a single cuBLAS axpy on
   the value arrays; otherwise it falls back to the host MatAXPY_SeqAIJ. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  PetscBool      flgx,flgy;

  PetscFunctionBegin;
  if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
  PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
  PetscValidHeaderSpecific(X,MAT_CLASSID,3);
  ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
  /* if either matrix is not of this type, defer entirely to the host implementation */
  if (!flgx || !flgy) {
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
  /* cheap upgrade: if the caller said DIFFERENT but i/j arrays match exactly, treat as SAME */
  if (str == DIFFERENT_NONZERO_PATTERN) {
    if (x->nz == y->nz) {
      PetscBool e;
      ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
      if (e) {
        ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
        if (e) {
          str = SAME_NONZERO_PATTERN;
        }
      }
    }
  }
  if (str != SAME_NONZERO_PATTERN) {
    /* different pattern changes the sparsity structure; only the host code handles that */
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else {
    Mat_SeqAIJCUSPARSE           *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
    Mat_SeqAIJCUSPARSE           *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;
    if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
      /* device CSR not built yet: sync values to the host, add there, push back to the GPU */
      if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
      }
      if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
      }
      ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
    } else {
      /* same pattern + device data on both sides: axpy directly on the CSR value arrays */
      cublasHandle_t cublasv2handle;
      cublasStatus_t cberr;
      cudaError_t    err;
      PetscScalar    alpha = a;
      PetscBLASInt   one = 1, bnz = 1;
      CsrMatrix      *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
      CsrMatrix      *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
      PetscScalar    *aa_y, *aa_x;
      aa_y = matrix_y->values->data().get();
      aa_x = matrix_x->values->data().get();
      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); /* one multiply + one add per stored value */
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
      ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
      /* the GPU copy is now ahead of the host copy */
      if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU;
      else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
      ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); /* bring the host copy up to date */
    }
  }
  PetscFunctionReturn(0);
}
3112 
/* Zero all stored values on both device (if present) and host, preserving the sparsity pattern. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      zeroedOnDevice = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    /* zero the device CSR values, and the cached transpose's values when it exists */
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        zeroedOnDevice = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the host values too, and drop the cached diagonal */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  /* both copies are zero only when the device values existed and were filled */
  A->offloadmask = zeroedOnDevice ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
3144 
/* Bind the matrix to the CPU (flg) or GPU (!flg) by selecting, slot by slot, the host
   or device implementation for each operation and composed method. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* before handing control to the CPU, make sure the host values are current */
  if (flg) { ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); }
  A->ops->axpy                      = flg ? MatAXPY_SeqAIJ             : MatAXPY_SeqAIJCUSPARSE;
  A->ops->zeroentries               = flg ? MatZeroEntries_SeqAIJ      : MatZeroEntries_SeqAIJCUSPARSE;
  A->ops->mult                      = flg ? MatMult_SeqAIJ             : MatMult_SeqAIJCUSPARSE;
  A->ops->multadd                   = flg ? MatMultAdd_SeqAIJ          : MatMultAdd_SeqAIJCUSPARSE;
  A->ops->multtranspose             = flg ? MatMultTranspose_SeqAIJ    : MatMultTranspose_SeqAIJCUSPARSE;
  A->ops->multtransposeadd          = flg ? MatMultTransposeAdd_SeqAIJ : MatMultTransposeAdd_SeqAIJCUSPARSE;
  A->ops->multhermitiantranspose    = flg ? NULL : MatMultHermitianTranspose_SeqAIJCUSPARSE;    /* no host Hermitian op */
  A->ops->multhermitiantransposeadd = flg ? NULL : MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
  A->ops->productsetfromoptions     = flg ? MatProductSetFromOptions_SeqAIJ : MatProductSetFromOptions_SeqAIJCUSPARSE;
  /* composed methods: installed on the GPU side, removed (NULL) when bound to the CPU */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",flg ? NULL : MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",flg ? NULL : MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",flg ? NULL : MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",flg ? NULL : MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",flg ? MatSeqAIJGetArray_SeqAIJ : MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",flg ? NULL : MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  A->boundtocpu = flg;
  a->inode.use  = flg; /* inode optimization only applies to the host kernels */
  PetscFunctionReturn(0);
}
3191 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: (re)create the destination per 'reuse',
   switch the default vector type to CUDA, allocate the GPU bookkeeping struct on first
   conversion, and install the CUSPARSE operation table. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* MAT_INPLACE_MATRIX falls through: *newmat is A itself */
  B = *newmat;

  /* vectors created from this matrix default to the CUDA type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: Mat_SeqAIJCUSPARSE holds the device CSR data and handle */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrix: trifactor container instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* no device values yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the remaining GPU ops and composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3239 
/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, convert it in place,
   then process type-specific options. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscOptionsObject below is declared by the PetscObjectOptionsBegin() macro */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3252 
3253 /*MC
3254    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3255 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3258    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3259 
3260    Options Database Keys:
3261 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3262 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3263 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3264 
3265   Level: beginner
3266 
3267 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3268 M*/
3269 
3270 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3271 
3272 
/* Register the CUSPARSE solver package for every factorization it supports;
   one factory routine serves all four factor types. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType factors[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t i = 0; i < sizeof(factors)/sizeof(factors[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factors[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3284 
/* Free the whole GPU-side container of an AIJCUSPARSE matrix: the mult structures
   (matrix and cached transpose), thrust work arrays, COO permutation data, the
   cuSPARSE handle, and finally the container itself. Sets *cusparsestruct to freed. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* delete on a null pointer is a no-op, so these need no guards */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* cudaFree(NULL) succeeds, so an unallocated csr2csc buffer is harmless here */
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3306 
/* Free a CsrMatrix: release its three device vectors, then the container, and null the pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = NULL;
  PetscFunctionReturn(0);
}
3319 
/* Free one triangular-factor structure: cuSPARSE descriptor and analysis info, the
   factor's CSR storage, and any scratch buffers (solve buffer, host value array,
   and — on CUDA 11+ — the csr2csc buffer). */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} /* pinned host memory */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3339 
/* Free a mult structure: the stored matrix (CSR or — pre-CUDA 11 — HYB/ELL), its
   cuSPARSE descriptor, the compressed-row index vector, the device-resident scalar
   constants, and on CUDA 11+ the SpMV descriptors/buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* bug fix: the return code of CsrMatrix_Destroy() was previously ignored */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-side scalars used by the SpMV calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* the three cached SpMV configurations (mult, transpose, Hermitian transpose) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3383 
/* Release the contents of a trifactor container (all four factor structures, the
   permutation index vectors, and the work vector) without freeing the container or
   its cuSPARSE handle, so it can be refilled by a subsequent factorization. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
  delete (*trifactors)->rpermIndices;   (*trifactors)->rpermIndices = NULL;
  delete (*trifactors)->cpermIndices;   (*trifactors)->cpermIndices = NULL;
  delete (*trifactors)->workVector;     (*trifactors)->workVector   = NULL;
  PetscFunctionReturn(0);
}
3403 
/* Fully destroy a trifactor container: reset its contents, destroy the cuSPARSE
   handle it owns, and free the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* fix: the assignment was previously hidden inside the if-condition (-Wparentheses) */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3420 
/* Strict weak ordering on (i,j) index pairs: sort by row, breaking ties by column.
   Used to order COO entries into CSR order on the device. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
3431 
/* Equality of (i,j) index pairs; used to collapse duplicate COO entries. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3441 
/* Binary op for adjacent_difference: 1 where consecutive indices differ, 0 where equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3450 
/* Logical OR of two 0/1 flags, returned as a PetscInt (combines row/column change flags). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
3459 
3460 #include <thrust/iterator/discard_iterator.h>
/* Set (or add) the matrix values from an array v[] ordered as the COO entries passed to
   MatSetPreallocationCOO were. Uses the permutation cooPerm (COO order -> sorted CSR
   order) built during preallocation; cooPerm_a, when present, are reduction keys that
   collapse duplicate (i,j) entries. v may be host or device memory; v == NULL with
   INSERT_VALUES zeros the matrix. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL,*cooPerm_w;  /* staging copies: v on device / reduced values */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  /* no COO preallocation was done: fall back to a plain (re)assembly */
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (!v) {
    /* NULL values + INSERT means "zero the matrix"; with ADD it is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v); /* already on the device: use it directly */
  } else {
    /* host input: stage a device copy of the n COO values */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: reduce permuted values per CSR slot, then add to the matrix */
      cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* unique entries: element-wise matrix->values += permuted v */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* unique entries: element-wise matrix->values = permuted v */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
finalize:
  delete cooPerm_v; /* deleting NULL is fine when v was already device memory */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
3539 
3540 #include <thrust/binary_search.h>
/* Preallocate a SeqAIJCUSPARSE matrix from n COO (coo_i[],coo_j[]) index pairs:
   the pairs are sorted and compacted on the GPU with thrust, the resulting CSR
   structure is mirrored on the host, and the permutation from the caller's input
   order to the sorted storage order is cached in cusp->cooPerm (plus, when
   duplicates exist, a sorted-entry -> unique-slot map in cusp->cooPerm_a) */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows with at least one entry */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* drop previously cached permutation data if the number of COO entries changed */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);         /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);         /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n); /* per-row end offsets (CSR row pointer, shifted by one) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    /* view the (i,j) pairs as one sequence of tuples so both arrays are sorted together */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort pairs lexicographically, dragging the identity permutation along:
       afterwards cooPerm[k] is the caller-array position of the k-th sorted entry */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i; /* keep sorted row indices before unique() compacts them */
    THRUSTINTARRAY w = d_j; /* likewise for the sorted column indices */

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* appears to build, for each sorted entry, the index of the unique (i,j) slot it
         belongs to: flag entries differing from their predecessor in i or j (IJDiff and
         IJSum are defined elsewhere in this file -- TODO confirm their exact semantics),
         then turn the flags into slot indices with an inclusive prefix sum */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* per-row counts: upper_bound on the sorted, uniqued row indices gives, for each
       row r, the number of entries with row index <= r, i.e. the CSR end offset of r */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays (a->i, a->j) from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* CSR convention: row pointer starts at zero */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* no entries: fall back to an empty standard preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* the cached transpose no longer matches the new nonzero pattern */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3642 
/*
   MatSeqAIJCUSPARSEGetArrayRead - returns a read-only device pointer to the CSR
   values of a MATSEQAIJCUSPARSE matrix, pushing host data to the GPU first if it
   is stale. Pair with MatSeqAIJCUSPARSERestoreArrayRead(). Only the CSR storage
   format is supported (ELL/HYB raise PETSC_ERR_SUP).
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out a pointer into it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3661 
/*
   MatSeqAIJCUSPARSERestoreArrayRead - releases the device pointer obtained with
   MatSeqAIJCUSPARSEGetArrayRead(). Read access does not modify the matrix, so
   neither the object state nor the offload mask is touched; the caller's
   pointer is simply invalidated.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}
3671 
/*
   MatSeqAIJCUSPARSEGetArrayWrite - returns a write-only device pointer to the
   CSR values of a MATSEQAIJCUSPARSE matrix. Unlike the read variant, the host
   values are NOT copied to the GPU first (the caller is expected to overwrite
   them); MatSeqAIJCUSPARSERestoreArrayWrite() then marks the device copy as the
   authoritative one. Only the CSR storage format is supported.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusparsestruct->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3688 
/*
   MatSeqAIJCUSPARSERestoreArrayWrite - releases the device pointer obtained
   with MatSeqAIJCUSPARSEGetArrayWrite(). The values are assumed to have been
   written on the device, so the object state is increased and the offload mask
   is set to PETSC_OFFLOAD_GPU, making the device copy the authoritative one.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU;
  *a = NULL;
  PetscFunctionReturn(0);
}
3702 
/* strict-weak ordering on (row,col,value,flag) tuples by lexicographic (row,col);
   the value and flag fields do not participate in the comparison */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, bool> &t1, const thrust::tuple<int, int, PetscScalar, bool> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;     /* different rows: order by row */
    return t1.get<1>() < t2.get<1>(); /* same row: order by column */
  }
};
3713 
/* unary functor adding a fixed offset to an integer index (used to shift B's
   column/row offsets when concatenating with A) */
struct Shift
{
  int delta_; /* offset added to every input value */

  Shift(int shift) : delta_(shift) { }
  __host__ __device__
  inline int operator() (const int &c) { return delta_ + c; }
};
3725 
/* merges two SeqAIJCUSPARSE matrices side by side, C = [A,B] ([A';B']' operation in matlab notation).
   A and B must have the same number of rows. With MAT_INITIAL_MATRIX the merged CSR structure of C is
   built on the GPU and a permutation (Ccusp->cooPerm) mapping the concatenated [A entries; B entries]
   into C's CSR slots is cached; with MAT_REUSE_MATRIX only the numerical values are refreshed through
   that cached permutation. Fixes: error messages previously read "number or rows". */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and its GPU mult structure from scratch */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by the cuSPARSE SpMV calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);

    if (c->nz) {
      /* expand A and B to COO row indices so the merged entries can be row-sorted into C */
      THRUSTINTARRAY32 Acoo(Annz);
      THRUSTINTARRAY32 Bcoo(Bnnz);
      THRUSTINTARRAY32 *roff;
      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Acusp->rowoffsets_gpu;
      } else roff = Acsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              roff->data().get(),
                              Annz,
                              m,
                              Acoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Bcusp->rowoffsets_gpu;
      } else roff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              roff->data().get(),
                              Bnnz,
                              m,
                              Bcoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      /* merge A's and B's (row,col,val) streams by (row,col); B's columns are shifted by
         A->cmap->n, and a boolean tag records whether each merged entry came from A */
      THRUSTINTARRAY32 Ccoo(c->nz);
      auto Aperm = thrust::make_constant_iterator(true);
      auto Bperm = thrust::make_constant_iterator(false);
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
      thrust::device_vector<bool> wPerm(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      thrust::merge(Azb,Aze,Bzb,Bze,Czb,IJCompare4());
      /* split C-slot indices by origin tag: first Annz entries of cooPerm are A's slots,
         the remaining Bnnz are B's */
      thrust::partition_copy(thrust::make_counting_iterator(zero),thrust::make_counting_iterator(c->nz),wPerm.begin(),p1,p2,thrust::identity<bool>());
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo.data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C' = [A'; B'] is a simple vertical stacking of the transposes: row offsets of B'
           are shifted by a->nz and appended; columns and values are concatenated */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* overwrite A's trailing offset with B's shifted first offset */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's CSR structure on the host */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: scatter A's and B's current values into C through the cached cooPerm */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* cooPerm[0..Annz) holds the C slots of A's entries, cooPerm[Annz..) those of B's */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        /* refresh the cached transpose values as well: C' values are just [A' values; B' values] */
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* the merged values live only on the device */
  PetscFunctionReturn(0);
}
4011