xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2e5835c6908701226356db06d805ef70170dbd55)
1 /*
2   Defines the basic matrix operations for the AIJ (compressed row)
3   matrix storage format using the CUSPARSE library,
4 */
5 #define PETSC_SKIP_SPINLOCK
6 #define PETSC_SKIP_CXX_COMPLEX_FIX
7 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8 
9 #include <petscconf.h>
10 #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11 #include <../src/mat/impls/sbaij/seq/sbaij.h>
12 #include <../src/vec/vec/impls/dvecimpl.h>
13 #include <petsc/private/vecimpl.h>
14 #undef VecType
15 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16 
17 const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21 
22   typedef enum {
23       CUSPARSE_MV_ALG_DEFAULT = 0,
24       CUSPARSE_COOMV_ALG      = 1,
25       CUSPARSE_CSRMV_ALG1     = 2,
26       CUSPARSE_CSRMV_ALG2     = 3
27   } cusparseSpMVAlg_t;
28 
29   typedef enum {
30       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36       CUSPARSE_SPMM_COO_ALG1    = 1,
37       CUSPARSE_SPMM_COO_ALG2    = 2,
38       CUSPARSE_SPMM_COO_ALG3    = 3,
39       CUSPARSE_SPMM_COO_ALG4    = 5,
40       CUSPARSE_SPMM_CSR_ALG1    = 4,
41       CUSPARSE_SPMM_CSR_ALG2    = 6,
42   } cusparseSpMMAlg_t;
43 
44   typedef enum {
45       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47   } cusparseCsr2CscAlg_t;
48   */
49   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52 #endif
53 
54 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57 
58 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
59 static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
60 static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61 
62 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
63 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
64 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
65 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
66 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
68 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
69 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
70 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
71 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
75 
76 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
77 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
78 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
79 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
80 static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
82 
83 PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
84 PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
85 
/*
   MatCUSPARSESetStream - Records a CUDA stream in the matrix' GPU data structure and
   attaches it to the matrix' cuSPARSE handle, so subsequent cuSPARSE calls run on it.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  stream - the CUDA stream to use

   Errors with PETSC_ERR_COR if the GPU structure has not been allocated.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusp->stream = stream;
  stat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
97 
/*
   MatCUSPARSESetHandle - Installs a cuSPARSE handle in the matrix' GPU data structure,
   destroying any previously held (different) handle, and sets the handle's pointer
   mode to CUSPARSE_POINTER_MODE_DEVICE.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  handle - the cuSPARSE handle to install

   Errors with PETSC_ERR_COR if the GPU structure has not been allocated.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle we currently own before adopting the caller's */
    if (cusp->handle) {stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);}
    cusp->handle = handle;
  }
  stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
114 
/*
   MatCUSPARSEClearHandle - Forgets (without destroying) the cuSPARSE handle stored in
   the matrix' GPU data structure. A no-op unless A is a MATSEQAIJCUSPARSE matrix with
   an allocated GPU structure. The handle itself is not destroyed here; presumably the
   caller owns it.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse && cusp && cusp->handle) cusp->handle = 0;
  PetscFunctionReturn(0);
}
127 
/* Query callback composed on factor matrices: reports that this factorization is
   provided by the MATSOLVERCUSPARSE solver package. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
134 
135 /*MC
136   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
137   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
138   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
139   performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
140   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
141   algorithms are not recommended. This class does NOT support direct solver operations.
142 
143   Level: beginner
144 
145 .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
146 M*/
147 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates an (empty) factor matrix B of type
   MATSEQAIJCUSPARSE for A and wires up the symbolic-factorization function pointers
   appropriate for the requested factor type.

   Input Parameters:
+  A     - the matrix to be factored
-  ftype - MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC

   Output Parameter:
.  B - the factor matrix (symbolic/numeric factorization not yet performed)

   Errors with PETSC_ERR_SUP for any other factor type.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  /* factorizations are only defined for square matrices; fail early with a clear
     message instead of silently building an n x n factor of a rectangular matrix */
  if (A->rmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Matrix must be square to obtain a factor");
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE; /* the factorization makes use of the ordering provided to the symbolic phase */
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* rows are preallocated by the symbolic factorization, so skip allocation here */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
173 
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format for the supported operations. For SEQAIJCUSPARSE matrices,
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both map to the single stored format. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
191 
192 /*@
193    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
194    operation. Only the MatMult operation can use different GPU storage formats
195    for MPIAIJCUSPARSE matrices.
196    Not Collective
197 
198    Input Parameters:
199 +  A - Matrix of type SEQAIJCUSPARSE
200 .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
201 -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
202 
203    Output Parameter:
204 
205    Level: intermediate
206 
207 .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
208 @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* Dispatch to the type-specific implementation composed under "MatCUSPARSESetFormat_C"
     (MatCUSPARSESetFormat_SeqAIJCUSPARSE for this type); PetscTryMethod makes this a
     silent no-op for matrix types that do not provide the method */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
218 
219 /*@
220    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
221 
222    Collective on mat
223 
224    Input Parameters:
225 +  A - Matrix of type SEQAIJCUSPARSE
226 -  transgen - the boolean flag
227 
228    Level: intermediate
229 
230 .seealso: MATSEQAIJCUSPARSE
231 @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      iscusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* silently ignore matrices of any other type */
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusp);CHKERRQ(ierr);
  if (iscusp) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    /* when disabling, drop any cached transpose so a later re-enable cannot see stale data */
    if (!transgen) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
251 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the -mat_cusparse_* command-line options:
   transpose generation, SpMV/TriSolve storage formats and (CUDA >= 11) the cuSPARSE
   SpMV, SpMM and csr2csc algorithm choices. All options are skipped for factored
   matrices (A->factortype != MAT_FACTOR_NONE).
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* -mat_cusparse_storage_format is processed after -mat_cusparse_mult_storage_format,
       so when both are given the former wins (it sets MAT_CUSPARSE_ALL) */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same positional-consistency guard as for the SpMV enum above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    /* cusparseCsr2CscAlg_t starts at 1, hence the "INVALID" placeholder at index 0 of MatCUSPARSECsr2CscAlgorithms[] */
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
295 
/* Symbolic ILU: discard any stale GPU factor data, run the CPU symbolic phase,
   then route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
307 
/* Symbolic LU: discard any stale GPU factor data, run the CPU symbolic phase,
   then route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
319 
/* Symbolic ICC: discard any stale GPU factor data, run the CPU symbolic phase,
   then route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
331 
/* Symbolic Cholesky: discard any stale GPU factor data, run the CPU symbolic phase,
   then route the numeric phase to the CUSPARSE implementation. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
343 
/*
   MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Extracts, from the host-side factored SeqAIJ
   storage of A, the unit-diagonal lower triangular factor L in CSR form, uploads it to
   the GPU, and runs the cuSPARSE triangular-solve analysis for it.

   First call builds the full CSR structure (indices + values) in pinned host memory and
   creates the cuSPARSE descriptor/analysis info; later calls (structure already present)
   only refresh the numerical values and re-upload them.

   NOTE(review): this assumes a->i/a->j/a->a hold the strictly lower part of the combined
   LU factor row-by-row (PETSc's factored-AIJ layout, row 0 holding no strictly-lower
   entries) — confirm against MatILUFactorSymbolic_SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the host holds data the device does not have yet */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        /* pinned host buffers so the thrust assign()s below can transfer efficiently */
        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix: row 0 holds only the unit diagonal,
           each subsequent row i is its strictly-lower entries followed by a 1.0 at column i */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* the assign()s copy the pinned host arrays to device (thrust device vectors) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer, sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer (AA_h) for the value-only update path below;
           the index buffers are no longer needed on the host */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix (same traversal as above, values only) */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
488 
/*
   MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Extracts, from the host-side factored SeqAIJ
   storage of A, the upper triangular factor U in CSR form (with the stored reciprocal
   pivot inverted into an actual diagonal value), uploads it to the GPU, and runs the
   cuSPARSE triangular-solve analysis for it.

   Mirrors MatSeqAIJCUSPARSEBuildILULowerTriMatrix: first call builds the CSR structure,
   later calls refresh values only.

   NOTE(review): this assumes PETSc's factored-AIJ layout where a->diag[i] locates row i's
   diagonal, a->diag decreases with i, and v[nz] (the entry at a->diag[i]) stores the
   reciprocal of the pivot (hence the 1./v[nz] below) — confirm against MatLUFactorNumeric_SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the host holds data the device does not have yet */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffers so the thrust assign()s below can transfer efficiently */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking rows from last to first while
           filling the CSR arrays from the back */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          /* row i's strictly-upper entries start just past the diagonal pointer of row i+1 */
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements; the stored pivot is a reciprocal */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* the assign()s copy the pinned host arrays to device (thrust device vectors) */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2-style API needs an explicit work buffer, sized by the library */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer (AA_h) for the value-only update path below;
           the index buffers are no longer needed on the host */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix (same traversal as above, values only) */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
632 
/* Upload the ILU triangular factors of A to the GPU and cache the row/column
   permutation index arrays needed by the permuted triangular solves.
   Safe to call repeatedly: the work vector and permutation arrays are only
   allocated the first time they are needed. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  IS                           colperm  = aij->icol;
  PetscBool                    ridentity,cidentity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build/upload both triangular factors */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the GPU unless it is the identity */
  ierr = ISIdentity(rowperm,&ridentity);CHKERRQ(ierr);
  if (!ridentity && !factors->rpermIndices) {
    const PetscInt *idx;

    ierr = ISGetIndices(rowperm,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(idx,idx+m);
    ierr = ISRestoreIndices(rowperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* cache the column permutation on the GPU unless it is the identity */
  ierr = ISIdentity(colperm,&cidentity);CHKERRQ(ierr);
  if (!cidentity && !factors->cpermIndices) {
    const PetscInt *idx;

    ierr = ISGetIndices(colperm,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(idx,idx+m);
    ierr = ISRestoreIndices(colperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
676 
/*
   Build (or refresh) the GPU-side triangular factor data for an ICC-factored matrix.

   On the first call (no factor structs allocated yet) the CSR structure of the upper
   triangular factor is assembled in pinned host memory, copied to the GPU for both the
   "upper" and "lower" factor structs, and the cusparse triangular-solve analysis is run.
   On subsequent calls only the numerical values are recomputed and re-uploaded.

   NOTE(review): the factored values are read through a Mat_SeqSBAIJ view of the same
   A->data pointer that Mat_SeqAIJ aliases; presumably the ICC factorization stores its
   result in SBAIJ-style arrays -- confirm against MatICCFactorSymbolic_SeqAIJ.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  /* empty matrix: nothing to build */
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the up-to-date values live on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the upper- and lower-factor values */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (!upTriFactor && !loTriFactor) {
        /* first call: build structure (row offsets / column indices) as well as values */
        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1.0/v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0/v[nz];

          offset+=1;
          if (nz>0) {
            /* off-diagonals: copy, negate for the upper factor, and scale by the
               diagonal for the lower factor */
            ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = A->rmap->n;
        upTriFactor->csrMat->num_cols = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* csrsv2 requires an explicitly sized work buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        /* NOTE: the "lower" factor reuses the upper-triangular storage pattern and is
           solved with the transpose operation (set just below), hence FILL_MODE_UPPER */
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = A->rmap->n;
        loTriFactor->csrMat->num_cols = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        /* same sparsity pattern as the upper factor; only the values differ */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                                #endif
);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

        ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      } else {
        /* structure already on the GPU: recompute values only */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i=0; i<n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0/v[nz];
          AALo[offset] = 1.0/v[nz];

          offset+=1;
          if (nz>0) {
            ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
            for (j=offset; j<offset+nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j]/v[nz];
            }
            offset+=nz;
          }
        }
        if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
        ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      }
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
892 
/* Upload the ICC triangular factor data to the GPU and, for a non-trivial
   ordering, cache both the permutation and its inverse as GPU index arrays
   for use by the permuted triangular solves. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           perm     = aij->row;
  PetscBool                    identity;
  PetscInt                     m = A->rmap->n;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  /* both triangular factors share the off-diagonal entries; only the diagonal is counted once */
  factors->nnz   = (aij->nz-m)*2 + m;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload row permutation and its inverse unless the ordering is trivial */
  ierr = ISIdentity(perm,&identity);CHKERRQ(ierr);
  if (!identity) {
    IS             invperm;
    const PetscInt *pidx,*ipidx;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invperm);CHKERRQ(ierr);
    ierr = ISGetIndices(invperm,&ipidx);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&pidx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(pidx,pidx+m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(ipidx,ipidx+m);
    ierr = ISRestoreIndices(invperm,&ipidx);CHKERRQ(ierr);
    ierr = ISDestroy(&invperm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&pidx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*m*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
930 
/* Numeric LU factorization for SEQAIJCUSPARSE: the factorization itself runs on
   the CPU; afterwards the solve callbacks are selected (natural-ordering fast
   path when both permutations are the identity) and the factors are copied to
   the GPU with the cusparse solve analysis. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      ridentity,cidentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor on the CPU; the GPU copy is now stale */
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(bseq->row,&ridentity);CHKERRQ(ierr);
  ierr = ISIdentity(bseq->col,&cidentity);CHKERRQ(ierr);
  if (ridentity && cidentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
960 
/* Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE: factor on the CPU,
   pick the solve callbacks (natural-ordering fast path for the identity
   permutation), then copy the factors to the GPU and run the solve analysis. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *bseq = (Mat_SeqAIJ*)B->data;
  PetscBool      identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor on the CPU; the GPU copy is now stale */
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(bseq->row,&identity);CHKERRQ(ierr);
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
989 
/*
   Build the transposed (CSC) form of both triangular factors and run the cusparse
   solve analysis on them, so that MatSolveTranspose can be performed on the GPU.
   The new factor structs are stored in loTriFactorPtrTranspose/upTriFactorPtrTranspose.

   Fix: the second MAT_CUSPARSEGenerateTranspose logging call for each factor was
   PetscLogEventBegin instead of PetscLogEventEnd, so the event was begun twice and
   never ended, corrupting the -log_view event nesting and timings.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor:
     same type/base/diag as the source factor, fill mode flipped by the transpose */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA 11 csr2csc needs an explicitly sized work buffer */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                                       loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                                       loTriFactor->csrMat->values->data().get(),
                                       loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(),
                                       loTriFactorT->csrMat->values->data().get(),
                                       loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
                          loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
                          loTriFactor->csrMat->values->data().get(),
                          loTriFactor->csrMat->row_offsets->data().get(),
                          loTriFactor->csrMat->column_indices->data().get(),
                          loTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
                        #else
                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
                                loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                                loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                                &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
                           loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
                           loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                                upTriFactor->csrMat->values->data().get(),
                                upTriFactor->csrMat->row_offsets->data().get(),
                                upTriFactor->csrMat->column_indices->data().get(),
                                upTriFactorT->csrMat->values->data().get(),
                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
                          upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
                          upTriFactor->csrMat->values->data().get(),
                          upTriFactor->csrMat->row_offsets->data().get(),
                          upTriFactor->csrMat->column_indices->data().get(),
                          upTriFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
                        #else
                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); /* was PetscLogEventBegin: event never ended */

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
                                 upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                                 upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
                                 &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
  #endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
                           upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
                           upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1191 
/*
   Builds and caches the cusparse representation of A^T (stored in
   cusparsestruct->matTranspose) so MatMultTranspose() can run as a regular
   SpMV on the cached transpose.  The transpose is formed on the GPU via
   csr2csc.  No-op when transpose generation was not requested, the transpose
   already exists, or the matrix has no rows or columns.
 */
static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* create cusparse matrix descriptor for A^T, with the same index base as A */
  matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
  stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
  indexBase = cusparseGetMatIndexBase(matstruct->descr);
  stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

  /* set alpha and beta: device-resident scalars used by the SpMV routines */
  err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

  if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
    CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT= new CsrMatrix;
    /* A^T is A->cmap->n by A->rmap->n with the same number of nonzeros */
    matrixT->num_rows = A->cmap->n;
    matrixT->num_cols = A->rmap->n;
    matrixT->num_entries = a->nz;
    matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
    matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
    matrixT->values = new THRUSTARRAY(a->nz);

    /* cache the uncompressed row offsets of A on the GPU; csr2csc needs them below */
    cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
    cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

    /* compute the transpose, i.e. the CSC */
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 requires an explicitly sized work buffer for csr2csc */
    stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                  A->cmap->n, matrix->num_entries,
                                  matrix->values->data().get(),
                                  cusparsestruct->rowoffsets_gpu->data().get(),
                                  matrix->column_indices->data().get(),
                                  matrixT->values->data().get(),
                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC,indexBase,
                                  cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
    err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
   #endif

    stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
    matstructT->mat = matrixT;

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparseCreateCsr(&matstructT->matDescr,
                             matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                             matrixT->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                             indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
   #endif
  } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
    CsrMatrix *temp  = new CsrMatrix;
    CsrMatrix *tempT = new CsrMatrix;
    /* First convert HYB to CSR */
    temp->num_rows = A->rmap->n;
    temp->num_cols = A->cmap->n;
    temp->num_entries = a->nz;
    temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
    temp->column_indices = new THRUSTINTARRAY32(a->nz);
    temp->values = new THRUSTARRAY(a->nz);

    stat = cusparse_hyb2csr(cusparsestruct->handle,
                            matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get());CHKERRCUSPARSE(stat);

    /* Next, convert CSR to CSC (i.e. the matrix transpose).  The transpose has
       the dimensions swapped: A^T is A->cmap->n by A->rmap->n, so row_offsets
       must hold num_rows+1 = A->cmap->n+1 entries (sizing it with A->rmap->n+1
       overflows for rectangular matrices with more columns than rows; cf. the
       CSR branch above, which swaps the dimensions) */
    tempT->num_rows = A->cmap->n;
    tempT->num_cols = A->rmap->n;
    tempT->num_entries = a->nz;
    tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
    tempT->column_indices = new THRUSTINTARRAY32(a->nz);
    tempT->values = new THRUSTARRAY(a->nz);

    stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                            temp->num_cols, temp->num_entries,
                            temp->values->data().get(),
                            temp->row_offsets->data().get(),
                            temp->column_indices->data().get(),
                            tempT->values->data().get(),
                            tempT->column_indices->data().get(),
                            tempT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

    /* Last, convert CSC to HYB; the HYB matrix holds A^T, hence the swapped dimensions */
    cusparseHybMat_t hybMat;
    stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
    cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
      CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
    stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                            matstructT->descr, tempT->values->data().get(),
                            tempT->row_offsets->data().get(),
                            tempT->column_indices->data().get(),
                            hybMat, 0, partition);CHKERRCUSPARSE(stat);

    /* assign the pointer */
    matstructT->mat = hybMat;
    /* delete temporaries */
    if (tempT) {
      if (tempT->values) delete (THRUSTARRAY*) tempT->values;
      if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
      if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
    }
    if (temp) {
      if (temp->values) delete (THRUSTARRAY*) temp->values;
      if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
      if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
    }
   #endif
  }
  err  = WaitForCUDA();CHKERRCUDA(err);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  PetscFunctionReturn(0);
}
1350 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
   Triangular solve of the transposed system A^T x = b on the GPU, with
   row/column permutations.  The transposed factor structures are built
   lazily on first use.  Because the factors apply in reverse order for a
   transpose solve, U^T is solved before L^T.
 */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation; x serves as scratch for the permuted rhs */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U^T: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L^T: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1437 
/*
   Triangular solve of A^T x = b when the factorization used the natural
   ordering, so no row/column permutations are applied: U^T solve into a
   work vector, then L^T solve into x.  The transposed factor structures
   are built lazily on first use.
 */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve U^T: b -> tempGPU (U^T comes before L^T in a transpose solve) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L^T: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1505 
/*
   Triangular solve A x = b on the GPU with row/column permutations:
   reorder b with rpermIndices, solve L then U with cusparse, then reorder
   the result into x with cpermIndices.
 */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U: x -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation into x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1580 
/*
   Triangular solve A x = b when the factorization used the natural
   ordering, so no permutations are applied: L solve into a work vector,
   then U solve into x.
 */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L: b -> tempGPU */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Next, solve U: tempGPU -> x */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* two triangular solves: 2*nnz - n flops */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1640 
/*
   Downloads the matrix values from the device CSR into the host CSR array
   when the GPU holds the only up-to-date copy; afterwards both copies are
   marked valid (PETSC_OFFLOAD_BOTH).  No-op otherwise.
 */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0); /* host copy already valid */
  csr  = (CsrMatrix*)cusp->mat->mat;
  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
1661 
/*
   MatSeqAIJGetArray() implementation for the CUSPARSE type: sync the values
   back to the host, mark the host copy as the authoritative one, and return
   the host value array.
 */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1673 
1674 static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
1675 {
1676   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1677   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
1678   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1679   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1680   PetscErrorCode               ierr;
1681   cusparseStatus_t             stat;
1682   cudaError_t                  err;
1683 
1684   PetscFunctionBegin;
1685   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1686   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1687     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
1688       /* Copy values only */
1689       CsrMatrix *matrix,*matrixT;
1690       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
1691 
1692       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1693       matrix->values->assign(a->a, a->a+a->nz);
1694       err  = WaitForCUDA();CHKERRCUDA(err);
1695       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
1696       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1697 
1698       /* Update matT when it was built before */
1699       if (cusparsestruct->matTranspose) {
1700         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1701         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
1702         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1703         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1704                             A->cmap->n, matrix->num_entries,
1705                             matrix->values->data().get(),
1706                             cusparsestruct->rowoffsets_gpu->data().get(),
1707                             matrix->column_indices->data().get(),
1708                             matrixT->values->data().get(),
1709                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1710                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1711                             CUSPARSE_ACTION_NUMERIC,indexBase,
1712                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1713                           #else
1714                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1715                             CUSPARSE_ACTION_NUMERIC, indexBase
1716                           #endif
1717 );CHKERRCUSPARSE(stat);
1718         err  = WaitForCUDA();CHKERRCUDA(err);
1719         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1720       }
1721     } else {
1722       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1723       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1724       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
1725       delete cusparsestruct->workVector;
1726       delete cusparsestruct->rowoffsets_gpu;
1727       try {
1728         if (a->compressedrow.use) {
1729           m    = a->compressedrow.nrows;
1730           ii   = a->compressedrow.i;
1731           ridx = a->compressedrow.rindex;
1732         } else {
1733           m    = A->rmap->n;
1734           ii   = a->i;
1735           ridx = NULL;
1736         }
1737         cusparsestruct->nrows = m;
1738 
1739         /* create cusparse matrix */
1740         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
1741         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
1742         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
1743         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1744 
1745         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
1746         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
1747         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1748         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1749         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1750         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1751         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1752 
1753         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1754         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1755           /* set the matrix */
1756           CsrMatrix *mat= new CsrMatrix;
1757           mat->num_rows = m;
1758           mat->num_cols = A->cmap->n;
1759           mat->num_entries = a->nz;
1760           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1761           mat->row_offsets->assign(ii, ii + m+1);
1762 
1763           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1764           mat->column_indices->assign(a->j, a->j+a->nz);
1765 
1766           mat->values = new THRUSTARRAY(a->nz);
1767           mat->values->assign(a->a, a->a+a->nz);
1768 
1769           /* assign the pointer */
1770           matstruct->mat = mat;
1771          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1772           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1773             stat = cusparseCreateCsr(&matstruct->matDescr,
1774                                     mat->num_rows, mat->num_cols, mat->num_entries,
1775                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1776                                     mat->values->data().get(),
1777                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1778                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1779           }
1780          #endif
1781         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1782          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1783           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1784          #else
1785           CsrMatrix *mat= new CsrMatrix;
1786           mat->num_rows = m;
1787           mat->num_cols = A->cmap->n;
1788           mat->num_entries = a->nz;
1789           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1790           mat->row_offsets->assign(ii, ii + m+1);
1791 
1792           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1793           mat->column_indices->assign(a->j, a->j+a->nz);
1794 
1795           mat->values = new THRUSTARRAY(a->nz);
1796           mat->values->assign(a->a, a->a+a->nz);
1797 
1798           cusparseHybMat_t hybMat;
1799           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1800           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1801             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1802           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1803               matstruct->descr, mat->values->data().get(),
1804               mat->row_offsets->data().get(),
1805               mat->column_indices->data().get(),
1806               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1807           /* assign the pointer */
1808           matstruct->mat = hybMat;
1809 
1810           if (mat) {
1811             if (mat->values) delete (THRUSTARRAY*)mat->values;
1812             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1813             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1814             delete (CsrMatrix*)mat;
1815           }
1816          #endif
1817         }
1818 
1819         /* assign the compressed row indices */
1820         if (a->compressedrow.use) {
1821           cusparsestruct->workVector = new THRUSTARRAY(m);
1822           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1823           matstruct->cprowIndices->assign(ridx,ridx+m);
1824           tmp = m;
1825         } else {
1826           cusparsestruct->workVector = NULL;
1827           matstruct->cprowIndices    = NULL;
1828           tmp = 0;
1829         }
1830         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1831 
1832         /* assign the pointer */
1833         cusparsestruct->mat = matstruct;
1834       } catch(char *ex) {
1835         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
1836       }
1837       err  = WaitForCUDA();CHKERRCUDA(err);
1838       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1839       cusparsestruct->nonzerostate = A->nonzerostate;
1840     }
1841     A->offloadmask = PETSC_OFFLOAD_BOTH;
1842   }
1843   PetscFunctionReturn(0);
1844 }
1845 
/* Thrust functor for zipped iterators: accumulates the first zipped element
   into the second, i.e. t<1> = t<1> + t<0>. Usable from host or device code. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t); }
};
1855 
/* Thrust functor for zipped iterators: copies the first zipped element onto
   the second, i.e. t<1> = t<0>. Usable from host or device code. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
1865 
/* Thrust functor for zipped iterators: copies the second zipped element onto
   the first, i.e. t<0> = t<1> (the opposite direction of VecCUDAEquals). */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<0>(t) = thrust::get<1>(t); }
};
1875 
/* Per-product scratch data attached to C->product->data for AIJCUSPARSE
   sparse-dense and sparse-sparse mat-mat products; freed by MatDestroy_MatMatCusparse. */
struct MatMatCusparse {
  PetscBool             cisdense;  /* C was MATSEQDENSE (CPU) on entry to the symbolic phase; convert back after the GPU product */
  PetscScalar           *Bt;       /* device buffer holding an explicit B^T (pre-CUDA-11 csrmm cannot transpose B) */
  Mat                   X;         /* intermediate dense matrix for PtAP/RARt products */
  PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;    /* flop count logged during the numeric phase */
  CsrMatrix             *Bcsr;    /* uncompressed copy of B when B is stored in compressed row format */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor for Bcsr */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;  /* dense descriptor for B (SpMM) */
  cusparseDnMatDescr_t  matCDescr;  /* dense descriptor for C or X (SpMM) */
  PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
  size_t                mmBufferSize; /* size of mmBuffer currently allocated */
  void                  *mmBuffer;    /* workspace for cusparseSpMM/SpGEMM_compute */
  void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* descriptor reused between SpGEMM symbolic and numeric phases */
#endif
};
1895 
/* Destructor for the MatMatCusparse scratch data hung off C->product->data.
   Releases device buffers, cuSPARSE descriptors (CUDA >= 11 only) and the
   intermediate dense matrix X used by PtAP/RARt products. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode   ierr;
  MatMatCusparse   *mmdata = (MatMatCusparse *)data;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); /* cudaFree(NULL) is a no-op, so an unused Bt is fine */
  delete mmdata->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* each handle/buffer is only created on demand, hence the NULL guards */
  if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
  if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
  if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
  if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
 #endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); /* MatDestroy handles X == NULL */
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1920 
1921 PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1922 
/* Numeric phase of a sparse(A) x dense(B) product with A of type MATSEQAIJCUSPARSE
   and B dense. Supports AB, AtB, ABt, PtAP and RARt; for PtAP/RARt the sparse
   product is first computed into the intermediate dense matrix mmdata->X and the
   remaining dense-dense product is done by
   MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private.
   CUDA >= 11 uses the generic cusparseSpMM API (descriptors cached in mmdata);
   older CUDA uses cusparseXcsrmm, which cannot transpose B, so B^T is formed
   explicitly with cublasXgeam into mmdata->Bt when needed. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick op(A) and the output dimensions m x n for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP computes X = A*P here; Pt*X is applied below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    /* either let cusparse transpose on the fly, or use an explicitly stored A^T */
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt computes X = A*R^t here; R*X is applied below */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP/RARt write the sparse product into the intermediate X, others directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed LDA invalidates the cached dense descriptor; destroy and recreate */
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* only grow the workspace; reuse a previously allocated buffer if it is large enough */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt (allocated in the symbolic phase) via geam */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish PtAP/RARt with the dense-dense product on X; transA=PETSC_TRUE only for PtAP */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* restore the CPU dense types the caller passed in, if we converted above */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2096 
/* Symbolic phase of a sparse(A) x dense(B) product: sets the sizes/type of C,
   allocates the MatMatCusparse product data (including the B^T buffer needed by
   the pre-CUDA-11 numeric phase and the intermediate X for PtAP/RARt), and
   installs the numeric-phase callback. No numerical work is done here. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type: AB -> mA x nB, AtB -> nA x nB, ABt -> mA x mB,
     PtAP -> nB x nB, RARt -> mB x mB */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense; /* remembered so the numeric phase can convert C back */
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2170 
/* Numeric phase of the sparse-sparse product C = op(A)*op(B) with all three
   matrices of type MATSEQAIJCUSPARSE. The sparsity pattern and SpGEMM descriptor
   were set up by the symbolic phase; here only the values are (re)computed, via
   cusparseSpGEMM_compute/copy (CUDA >= 11) or cusparse_csr_spgemm (older CUDA).
   When mmdata->reusesym is set, the values were already computed during the
   symbolic phase and only the bookkeeping at 'finalize' is performed. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty result: nothing to compute, just finalize the assembly state */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* symmetric operands let us avoid the explicit transpose */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* recompute the values with the descriptor/workspace cached from the symbolic phase,
     then copy them into C's CSR storage */
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2287 
2288 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2289 {
2290   Mat_Product                  *product = C->product;
2291   Mat                          A,B;
2292   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2293   Mat_SeqAIJ                   *a,*b,*c;
2294   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2295   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2296   PetscInt                     i,j,m,n,k;
2297   PetscBool                    flg;
2298   PetscErrorCode               ierr;
2299   cusparseStatus_t             stat;
2300   cudaError_t                  cerr;
2301   MatProductType               ptype;
2302   MatMatCusparse               *mmdata;
2303   PetscLogDouble               flops;
2304   PetscBool                    biscompressed,ciscompressed;
2305 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2306   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2307   size_t                       bufSize2;
2308   cusparseSpMatDescr_t         BmatSpDescr;
2309 #else
2310   int                          cnz;
2311 #endif
2312 
2313   PetscFunctionBegin;
2314   MatCheckProduct(C,1);
2315   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2316   A    = product->A;
2317   B    = product->B;
2318   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2319   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2320   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2321   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2322   a = (Mat_SeqAIJ*)A->data;
2323   b = (Mat_SeqAIJ*)B->data;
2324   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2325   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2326   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2327   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2328 
2329   /* product data */
2330   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2331   C->product->data    = mmdata;
2332   C->product->destroy = MatDestroy_MatMatCusparse;
2333 
2334   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2335   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2336   ptype = product->type;
2337   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2338   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2339   biscompressed = PETSC_FALSE;
2340   ciscompressed = PETSC_FALSE;
2341   switch (ptype) {
2342   case MATPRODUCT_AB:
2343     m = A->rmap->n;
2344     n = B->cmap->n;
2345     k = A->cmap->n;
2346     Amat = Acusp->mat;
2347     Bmat = Bcusp->mat;
2348     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2349     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2350     break;
2351   case MATPRODUCT_AtB:
2352     m = A->cmap->n;
2353     n = B->cmap->n;
2354     k = A->rmap->n;
2355     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2356     Amat = Acusp->matTranspose;
2357     Bmat = Bcusp->mat;
2358     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2359     break;
2360   case MATPRODUCT_ABt:
2361     m = A->rmap->n;
2362     n = B->rmap->n;
2363     k = A->cmap->n;
2364     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2365     Amat = Acusp->mat;
2366     Bmat = Bcusp->matTranspose;
2367     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2368     break;
2369   default:
2370     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2371   }
2372 
2373   /* create cusparse matrix */
2374   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2375   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2376   c     = (Mat_SeqAIJ*)C->data;
2377   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2378   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2379   Ccsr  = new CsrMatrix;
2380 
2381   c->compressedrow.use = ciscompressed;
2382   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2383     c->compressedrow.nrows = a->compressedrow.nrows;
2384     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2385     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2386     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2387     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2388     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2389   } else {
2390     c->compressedrow.nrows  = 0;
2391     c->compressedrow.i      = NULL;
2392     c->compressedrow.rindex = NULL;
2393     Ccusp->workVector       = NULL;
2394     Cmat->cprowIndices      = NULL;
2395   }
2396   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2397   Ccusp->mat      = Cmat;
2398   Ccusp->mat->mat = Ccsr;
2399   Ccsr->num_rows    = Ccusp->nrows;
2400   Ccsr->num_cols    = n;
2401   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2402   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2403   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2404   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2405   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2406   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2407   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2408   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2409   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2410   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2411   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2412     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2413     c->nz = 0;
2414     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2415     Ccsr->values = new THRUSTARRAY(c->nz);
2416     goto finalizesym;
2417   }
2418 
2419   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2420   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2421   Acsr = (CsrMatrix*)Amat->mat;
2422   if (!biscompressed) {
2423     Bcsr = (CsrMatrix*)Bmat->mat;
2424 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2425     BmatSpDescr = Bmat->matDescr;
2426 #endif
2427   } else { /* we need to use row offsets for the full matrix */
2428     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2429     Bcsr = new CsrMatrix;
2430     Bcsr->num_rows       = B->rmap->n;
2431     Bcsr->num_cols       = cBcsr->num_cols;
2432     Bcsr->num_entries    = cBcsr->num_entries;
2433     Bcsr->column_indices = cBcsr->column_indices;
2434     Bcsr->values         = cBcsr->values;
2435     if (!Bcusp->rowoffsets_gpu) {
2436       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2437       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2438       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2439     }
2440     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2441     mmdata->Bcsr = Bcsr;
2442 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2443     if (Bcsr->num_rows && Bcsr->num_cols) {
2444       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2445                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2446                                Bcsr->values->data().get(),
2447                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2448                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2449     }
2450     BmatSpDescr = mmdata->matSpBDescr;
2451 #endif
2452   }
2453   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2454   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2455   /* precompute flops count */
2456   if (ptype == MATPRODUCT_AB) {
2457     for (i=0, flops = 0; i<A->rmap->n; i++) {
2458       const PetscInt st = a->i[i];
2459       const PetscInt en = a->i[i+1];
2460       for (j=st; j<en; j++) {
2461         const PetscInt brow = a->j[j];
2462         flops += 2.*(b->i[brow+1] - b->i[brow]);
2463       }
2464     }
2465   } else if (ptype == MATPRODUCT_AtB) {
2466     for (i=0, flops = 0; i<A->rmap->n; i++) {
2467       const PetscInt anzi = a->i[i+1] - a->i[i];
2468       const PetscInt bnzi = b->i[i+1] - b->i[i];
2469       flops += (2.*anzi)*bnzi;
2470     }
2471   } else { /* TODO */
2472     flops = 0.;
2473   }
2474 
2475   mmdata->flops = flops;
2476   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2477 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2478   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2479   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2480                            NULL, NULL, NULL,
2481                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2482                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2483   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2484   /* ask bufferSize bytes for external memory */
2485   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2486                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2487                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2488                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2489   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUSPARSE(stat);
2490   /* inspect the matrices A and B to understand the memory requirement for the next step */
2491   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2492                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2493                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2494                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2495   /* ask bufferSize again bytes for external memory */
2496   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2497                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2498                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2499                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2500   /* The CUSPARSE documentation is not clear, nor the API
2501      We need both buffers to perform the operations properly!
2502      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2503      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2504      is stored in the descriptor! What a messy API... */
2505   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUSPARSE(stat);
2506   /* compute the intermediate product of A * B */
2507   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2508                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2509                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2510                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2511   /* get matrix C non-zero entries C_nnz1 */
2512   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2513   c->nz = (PetscInt) C_nnz1;
2514   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2515   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2516   Ccsr->values = new THRUSTARRAY(c->nz);
2517   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2518   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2519                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2520   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2521                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2522                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2523 #else
2524   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2525   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2526                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2527                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2528                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2529                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2530   c->nz = cnz;
2531   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2532   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2533   Ccsr->values = new THRUSTARRAY(c->nz);
2534   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2535 
2536   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2537   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2538      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2539      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2540   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2541                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2542                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2543                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2544                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2545 #endif
2546   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2547   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2548   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2549 finalizesym:
2550   c->singlemalloc = PETSC_FALSE;
2551   c->free_a       = PETSC_TRUE;
2552   c->free_ij      = PETSC_TRUE;
2553   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2554   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2555   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2556     PetscInt *d_i = c->i;
2557     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2558     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2559     ii   = *Ccsr->row_offsets;
2560     jj   = *Ccsr->column_indices;
2561     if (ciscompressed) d_i = c->compressedrow.i;
2562     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2563     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2564   } else {
2565     PetscInt *d_i = c->i;
2566     if (ciscompressed) d_i = c->compressedrow.i;
2567     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2568     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2569   }
2570   if (ciscompressed) { /* need to expand host row offsets */
2571     PetscInt r = 0;
2572     c->i[0] = 0;
2573     for (k = 0; k < c->compressedrow.nrows; k++) {
2574       const PetscInt next = c->compressedrow.rindex[k];
2575       const PetscInt old = c->compressedrow.i[k];
2576       for (; r < next; r++) c->i[r+1] = old;
2577     }
2578     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2579   }
2580   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2581   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2582   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2583   c->maxnz = c->nz;
2584   c->nonzerorowcnt = 0;
2585   c->rmax = 0;
2586   for (k = 0; k < m; k++) {
2587     const PetscInt nn = c->i[k+1] - c->i[k];
2588     c->ilen[k] = c->imax[k] = nn;
2589     c->nonzerorowcnt += (PetscInt)!!nn;
2590     c->rmax = PetscMax(c->rmax,nn);
2591   }
2592   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2593   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2594   Ccsr->num_entries = c->nz;
2595 
2596   C->nonzerostate++;
2597   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2598   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2599   Ccusp->nonzerostate = C->nonzerostate;
2600   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2601   C->preallocated  = PETSC_TRUE;
2602   C->assembled     = PETSC_FALSE;
2603   C->was_assembled = PETSC_FALSE;
2604   if (product->api_user) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2605     mmdata->reusesym = PETSC_TRUE;
2606     C->offloadmask   = PETSC_OFFLOAD_GPU;
2607   }
2608   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2609   PetscFunctionReturn(0);
2610 }
2611 
2612 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2613 
/* Choose the symbolic-product implementation for this matrix product.
   Handles sparse (SeqAIJCUSPARSE) or dense (SeqDense) B:
   - dense B on the GPU -> the sparse-dense CUDA kernels,
   - cusparse B (and, for ABC, cusparse C) -> the sparse-sparse cusparse kernels,
   - anything else (or operands bound to the CPU) -> the host AIJ implementation. */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  /* classify the operands; a B (or C) bound to the CPU is treated as non-cusparse */
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  if (!product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }
  if (isdense) {
    MatProductType ptype = product->type;
    if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt || ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) { /* A lives on the host: fall back to the CPU sparse-dense kernels */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    } else if (ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* other product types: leave productsymbolic unset, as before */
  } else if (Biscusp && Ciscusp) {
    MatProductType ptype = product->type;
    if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt || ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
    /* other product types: leave productsymbolic unset, as before */
  } else { /* fallback for plain AIJ operands */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2672 
/* MatMult_SeqAIJCUSPARSE - computes yy = A*xx on the GPU.
   Thin wrapper: delegates to MatMultAddKernel_SeqAIJCUSPARSE with no add-vector
   (NULL in the yy slot) and trans = herm = PETSC_FALSE, i.e. op(A) = A. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2681 
/* MatMultAdd_SeqAIJCUSPARSE - computes zz = A*xx + yy on the GPU.
   Thin wrapper: delegates to MatMultAddKernel_SeqAIJCUSPARSE with
   trans = herm = PETSC_FALSE, i.e. op(A) = A. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2690 
/* MatMultHermitianTranspose_SeqAIJCUSPARSE - computes yy = A^H * xx on the GPU.
   Thin wrapper: delegates to MatMultAddKernel_SeqAIJCUSPARSE with no add-vector
   and trans = herm = PETSC_TRUE, i.e. op(A) = conjugate transpose. */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2699 
/* MatMultHermitianTransposeAdd_SeqAIJCUSPARSE - computes zz = A^H * xx + yy on the GPU.
   Thin wrapper: delegates to MatMultAddKernel_SeqAIJCUSPARSE with
   trans = herm = PETSC_TRUE, i.e. op(A) = conjugate transpose. */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2708 
/* MatMultTranspose_SeqAIJCUSPARSE - computes yy = A^T * xx on the GPU.
   Thin wrapper: delegates to MatMultAddKernel_SeqAIJCUSPARSE with no add-vector,
   trans = PETSC_TRUE and herm = PETSC_FALSE, i.e. op(A) = transpose. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2717 
/* MatMultAddKernel_SeqAIJCUSPARSE - shared kernel for all SpMV variants:
     z = op(A) x + y.  If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   All of MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose(Add) delegate here.
   yy may be NULL (plain multiply) or equal zz (in-place add).  When the matrix is stored in
   compressed-row form (zero rows dropped), a work vector holds the short product (or the
   gathered short x for the transpose case) and the result is scattered into the full zz. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny; /* logical lengths of x and y for the cusparseSpMV dense-vector descriptors */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  /* matrix has no nonzero rows: op(A) x is zero, so the result is 0 or a copy of yy */
  if (!a->nonzerorowcnt) {
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* Pick the mult struct and cusparse operation: for the transpose we either use an explicitly
     generated transpose matrix (when transgen is set and not Hermitian) or let cusparse apply
     the (conjugate) transpose operation to the untransposed storage. */
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        /* gather the entries of x at the nonzero-row indices into the (shorter) work vector */
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation descriptor cache below, hence the range check */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      /* ELL/HYB path (pre CUDA-11 only) */
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose case: the SpMV already produced the full-length result with beta folding
         in yy when yy == zz; only an explicit distinct yy still needs to be added */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without the add, one axpy per nonzero row is saved */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2901 
/* MatMultTransposeAdd_SeqAIJCUSPARSE - computes zz = A^T * xx + yy on the GPU.
   Thin wrapper: delegates to MatMultAddKernel_SeqAIJCUSPARSE with
   trans = PETSC_TRUE and herm = PETSC_FALSE, i.e. op(A) = transpose. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2910 
/* MatAssemblyEnd_SeqAIJCUSPARSE - completes assembly, then marks the matrix as
   resident on the GPU when a device-side split-CSR structure exists (i.e. values
   were assembled directly on the device). Flush-stage assemblies and CPU-bound
   matrices are left untouched. */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *dmat = NULL;

  PetscFunctionBegin;
  /* only unfactored matrices carry a device-side split-CSR structure */
  if (A->factortype == MAT_FACTOR_NONE) dmat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  /* host-side assembly; it does very little if the matrix was assembled on the GPU */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && dmat) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
2927 
2928 /* --------------------------------------------------------------------------------*/
2929 /*@
2930    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2932    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2933    assembly performance the user should preallocate the matrix storage by setting
2934    the parameter nz (or the array nnz).  By setting these parameters accurately,
2935    performance during matrix assembly can be increased by more than a factor of 50.
2936 
2937    Collective
2938 
2939    Input Parameters:
2940 +  comm - MPI communicator, set to PETSC_COMM_SELF
2941 .  m - number of rows
2942 .  n - number of columns
2943 .  nz - number of nonzeros per row (same for all rows)
2944 -  nnz - array containing the number of nonzeros in the various rows
2945          (possibly different for each row) or NULL
2946 
2947    Output Parameter:
2948 .  A - the matrix
2949 
2950    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
2952    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
2953 
2954    Notes:
2955    If nnz is given then nz is ignored
2956 
2957    The AIJ format (also called the Yale sparse matrix format or
2958    compressed row storage), is fully compatible with standard Fortran 77
2959    storage.  That is, the stored row and column indices can begin at
2960    either one (as in Fortran) or zero.  See the users' manual for details.
2961 
2962    Specify the preallocated storage with either nz or nnz (not both).
2963    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
2964    allocation.  For large problems you MUST preallocate memory or you
2965    will get TERRIBLE performance, see the users' manual chapter on matrices.
2966 
2967    By default, this format uses inodes (identical nodes) when possible, to
2968    improve numerical efficiency of matrix-vector products and solves. We
2969    search for consecutive rows with the same nonzero structure, thereby
2970    reusing matrix information to achieve increased efficiency.
2971 
2972    Level: intermediate
2973 
2974 .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
2975 @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* sequential matrix: local sizes coincide with the global sizes */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* nnz is logically const; the cast matches the non-const internal signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2987 
/* Destroys a SeqAIJCUSPARSE matrix: frees the GPU containers (plain matrix or
   triangular factors), releases any device-resident split-CSR matrix created for
   GPU assembly, detaches the composed methods, and finally hands the host data
   to MatDestroy_SeqAIJ. */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* stash the device-matrix pointer before spptr is destroyed below */
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back so its interior device pointers can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (a->compressedrow.use) {
      /* diag.i is separately allocated only in the compressed-row case */
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3022 
3023 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
3024 static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates by building a host SeqAIJ copy, then converting it in place to SeqAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3034 
/* Y = Y + a*X.  Falls back to the host MatAXPY_SeqAIJ unless both matrices are
   SeqAIJCUSPARSE in CSR format with GPU data and identical nonzero patterns, in
   which case a single cublas axpy is performed directly on the device value arrays. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  PetscBool      flgx,flgy;

  PetscFunctionBegin;
  if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
  PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
  PetscValidHeaderSpecific(X,MAT_CLASSID,3);
  ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
  if (!flgx || !flgy) {
    /* at least one operand is not CUSPARSE: use the CPU kernel */
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
  if (str == DIFFERENT_NONZERO_PATTERN) {
    /* patterns may coincide anyway; compare row offsets and column indices to upgrade */
    if (x->nz == y->nz) {
      PetscBool e;
      ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
      if (e) {
        ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
        if (e) {
          str = SAME_NONZERO_PATTERN;
        }
      }
    }
  }
  if (str != SAME_NONZERO_PATTERN) {
    ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else {
    Mat_SeqAIJCUSPARSE           *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
    Mat_SeqAIJCUSPARSE           *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;
    if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
      /* GPU copies not built yet: do the axpy on the host, then push both to the GPU */
      if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
      }
      if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
      }
      ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
    } else {
      /* same pattern: the value arrays line up, so a flat cublas axpy over nz entries suffices */
      cublasHandle_t cublasv2handle;
      cublasStatus_t cberr;
      cudaError_t    err;
      PetscScalar    alpha = a;
      PetscBLASInt   one = 1, bnz = 1;
      CsrMatrix      *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
      CsrMatrix      *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
      PetscScalar    *aa_y, *aa_x;
      aa_y = matrix_y->values->data().get();
      aa_x = matrix_x->values->data().get();
      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
      ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
      /* only the device copy of Y was modified */
      if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU;
      else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
      ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
3109 
/* Zeros all stored values: the GPU copies (matrix and its cached transpose) when
   they exist, and always the host array; the offload mask is set accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;  /* set when the device values were zeroed too */
  Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    if (spptr->mat) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
      if (matrix->values) {
        thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
      }
    }
  }
  //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  /* a->i[nrows] is the total number of stored nonzeros */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
3141 
/* Switches the matrix operation table between host (flg == PETSC_TRUE) and device
   implementations, and composes/uncomposes the matching query methods.  No-op for
   factored matrices. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* binding to CPU: make the host copy current, then install host kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    /* binding to GPU: install the CUSPARSE kernels and composed methods */
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are a host-side optimization; enable them only when bound to the CPU */
  a->inode.use = flg;
  PetscFunctionReturn(0);
}
3188 
/* Converts a SeqAIJ matrix to SeqAIJCUSPARSE (or refreshes an existing one):
   sets the CUDA default vector type, allocates the GPU-side container on first
   use, and installs the CUSPARSE method table via MatBindToCPU. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;  /* for MAT_INPLACE_MATRIX, B is A itself */

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrices carry a triangular-factor container instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* installs the device kernels and the remaining composed methods */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3236 
/* Type constructor: builds a SeqAIJ matrix, converts it in place, then processes options */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3249 
3250 /*MC
3251    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3252 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3254    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3255    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3256 
3257    Options Database Keys:
3258 +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3259 .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3260 -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3261 
3262   Level: beginner
3263 
3264 .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3265 M*/
3266 
3267 PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
3268 
3269 
/*
    Registers the CUSPARSE solver package as a provider of LU, Cholesky, ILU and
    ICC factorizations for MATSEQAIJCUSPARSE matrices.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factortypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              f;

  PetscFunctionBegin;
  for (f=0; f<sizeof(factortypes)/sizeof(factortypes[0]); f++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,factortypes[f],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3281 
/* Frees all device data held by a Mat_SeqAIJCUSPARSE container and the container
   itself.  Safe to call when *cusparsestruct is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    /* delete of a NULL pointer is a no-op for all of these */
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* guard the free for consistency with MatSeqAIJCUSPARSEMultStruct_Destroy();
       previously cudaFree() was called unconditionally (a no-op for NULL, but the
       guard makes the intent explicit and matches the file's convention) */
    if ((*cusparsestruct)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3303 
/* Releases the three device arrays of a CsrMatrix (row offsets, column indices,
   values) and the CsrMatrix object itself, then resets the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;
    delete m->row_offsets;
    delete m->column_indices;
    delete m->values;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
3316 
/* Frees one triangular-factor structure: the matrix descriptor, the solve
   analysis info, the CSR data, and the CUDA scratch buffers.  Safe to call
   when *trifactor is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
    if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    /* AA_h is host memory released with cudaFreeHost, not cudaFree */
    if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3336 
/* Frees a Mat_SeqAIJCUSPARSEMultStruct according to its storage format: the
   wrapped CSR (or legacy ELL/HYB) matrix, the descriptor, the device-resident
   scalar constants and, for CUDA >= 11, the cached SpMV descriptors and
   buffers.  Safe to call when *matstruct is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* fixed: the PetscErrorCode returned by CsrMatrix_Destroy() was previously
           discarded, violating the file's CHKERRQ convention */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* small device-side scalar constants used as SpMV coefficients */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) { /* cuSpMV caches three descriptor sets; presumably one per mult variant */
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
3380 
/* Releases the contents of a triangular-factors container (the four factor
   structures, the permutation index arrays and the work vector) without freeing
   the container or its cusparse handle. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* free and immediately NULL each owned pointer */
    delete fs->rpermIndices; fs->rpermIndices = NULL;
    delete fs->cpermIndices; fs->cpermIndices = NULL;
    delete fs->workVector;   fs->workVector   = NULL;
  }
  PetscFunctionReturn(0);
}
3400 
/* Fully destroys a triangular-factors container: resets its contents, destroys
   the cusparse handle (if any), and frees the container itself. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* fixed: the assignment previously lived inside the if condition
       (`if (handle = ...)`), which triggers -Wparentheses and obscures intent */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3417 
/* Strict weak ordering on (row,col) index pairs: by row first, then by column.
   Used to sort COO entries into CSR order. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;
    return t1.get<1>() < t2.get<1>();
  }
};
3428 
/* Equality on (row,col) index pairs; used to collapse duplicate COO entries */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
3438 
/* Binary op for adjacent_difference: 0 when consecutive values match, 1 otherwise */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != t2) ? 1 : 0;
  }
};
3447 
/* Logical OR of two marker values, yielding 0 or 1 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
3456 
3457 #include <thrust/iterator/discard_iterator.h>
/* Sets matrix values from a COO list previously registered with
   MatSetPreallocationCOO_SeqAIJCUSPARSE.  v is the value array (host or device
   memory, length = number of COO entries); imode is INSERT_VALUES or ADD_VALUES.
   The cooPerm array maps sorted CSR slots back to COO positions; cooPerm_a,
   when present, encodes runs of duplicate (i,j) entries to be reduced. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL,*cooPerm_w;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO preallocation information: just (re)assemble and return */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (!v) {
    /* NULL values: INSERT_VALUES zeros the matrix; ADD_VALUES leaves it unchanged */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* values live on the host: stage them in a temporary device array */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* duplicates present: reduce them into a scratch array, then add to current values */
      cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* no duplicates: add the permuted values in place */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      /* no duplicates: copy the permuted values straight into the CSR value array */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
finalize:
  delete cooPerm_v;  /* deleting NULL is a no-op */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3538 
3539 #include <thrust/binary_search.h>
/* MatSetPreallocationCOO_SeqAIJCUSPARSE - preallocates A from a list of n COO entries (coo_i[],coo_j[]).

   The (i,j) pairs are sorted on the GPU with thrust and duplicates are compressed; the host CSR
   arrays (a->i,a->j) of the underlying SeqAIJ are then built from the result. cusp->cooPerm keeps
   the permutation that maps the user's original COO ordering to the sorted ordering (used later
   when values are inserted). If repeated (i,j) entries exist, cusp->cooPerm_a additionally maps
   each sorted COO entry to the index of the unique nonzero it accumulates into; otherwise
   cusp->cooPerm_a is NULL. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts rows that have at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* Reuse a previously allocated permutation only when the entry count matches */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);         /* device copy of coo_i */
    THRUSTINTARRAY d_j(n);         /* device copy of coo_j */
    THRUSTINTARRAY ii(A->rmap->n); /* per row: number of unique entries with row index <= row (CSR offsets shifted by one) */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    /* zip (i,j) so a single sort moves both index arrays together */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* lexicographic sort by (i,j); cooPerm records where each original entry ended up */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    *cusp->cooPerm_a = d_i;  /* snapshot sorted row indices before unique() compacts them */
    THRUSTINTARRAY w = d_j;  /* snapshot sorted column indices for the same reason */

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* Mark with a nonzero each position whose (i,j) differs from its predecessor, then an
         inclusive scan over the marks yields, for every sorted COO entry, the index of the
         compressed (unique) nonzero it belongs to */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* count unique entries per row: upper_bound over the sorted, uniqued row indices */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* build the host CSR structure of the SeqAIJ matrix from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* d_j was compacted in place by unique(); its first a->nz entries are the CSR column indices */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    /* empty COO list: fall back to a standard empty preallocation */
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* any cached transpose refers to the old nonzero pattern; drop it */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3641 
/* MatSeqAIJCUSPARSEGetArrayRead - returns a read-only device pointer to the CSR values of A.

   The matrix is copied to the GPU first if the device copy is out of date. Only the CSR
   storage format is supported (not ELL/HYB). Pair with MatSeqAIJCUSPARSERestoreArrayRead();
   the object state is not increased since the values may not be modified through this pointer. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device values are current before handing them out */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get(); /* raw device pointer into the thrust vector */
  PetscFunctionReturn(0);
}
3660 
/* MatSeqAIJCUSPARSERestoreArrayRead - returns a pointer obtained with MatSeqAIJCUSPARSEGetArrayRead().

   Only invalidates the caller's pointer; no state change is needed since read access
   cannot have modified the matrix. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  *a = NULL;
  PetscFunctionReturn(0);
}
3670 
/* MatSeqAIJCUSPARSEGetArrayWrite - returns a writable device pointer to the CSR values of A.

   Only the CSR storage format is supported (not ELL/HYB). Unlike the read variant, no
   host-to-device copy is performed here — presumably because the caller is expected to
   overwrite all values (NOTE(review): confirm callers never rely on existing values).
   Pair with MatSeqAIJCUSPARSERestoreArrayWrite(), which marks the GPU copy as current. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get(); /* raw device pointer into the thrust vector */
  PetscFunctionReturn(0);
}
3687 
/* MatSeqAIJCUSPARSERestoreArrayWrite - returns a pointer obtained with MatSeqAIJCUSPARSEGetArrayWrite().

   Since the caller may have changed the values on the device, the object state is
   increased and the offload mask is set so the GPU copy is treated as the up-to-date one. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device now holds the authoritative values */
  *a = NULL;
  PetscFunctionReturn(0);
}
3701 
/* Comparison functor: lexicographic "less than" on the (row,col) leading components of a
   (row, col, value, flag) tuple; the value and flag components take no part in the ordering. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, bool> &t1, const thrust::tuple<int, int, PetscScalar, bool> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;      /* different rows: order by row */
    return t1.get<1>() < t2.get<1>();  /* same row: order by column */
  }
};
3712 
/* Functor adding a fixed offset to an integer index; used below to relocate the column
   indices (and transposed row offsets) of B when concatenating two CSR matrices. */
struct Shift
{
  int _shift; /* constant offset applied to every input */

  Shift(int offset) : _shift(offset) {}
  __host__ __device__
  inline int operator() (const int &idx)
  {
    return idx + _shift;
  }
};
3724 
/* Merges two SeqAIJCUSPARSE matrices side by side, C = [A,B]; [A';B']' operation in matlab notation.

   A and B must have the same number of rows and be MATSEQAIJCUSPARSE in CSR format.
   With reuse == MAT_INITIAL_MATRIX the result C is created: its device CSR (and, when both
   inputs have them, the transposed CSR) is assembled directly on the GPU via a COO round trip
   and a merge, the host CSR arrays of the underlying SeqAIJ are filled, and the permutation
   used to scatter values from A/B into C is cached in Ccusp->cooPerm. With MAT_REUSE_MATRIX
   only the numerical values of an existing C are refreshed through that cached permutation.
   MAT_INPLACE_MATRIX is not supported.

   Fixed here: the two "Invalid number or rows" error messages now read "Invalid number of rows". */
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and hand-build its cusparse structures (no compressed-row storage) */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants used by the SpMV calls */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);

    if (c->nz) {
      /* Expand A and B to COO, shift B's columns by A->cmap->n, merge the two sorted
         COO streams into C, and convert back to CSR. A boolean tag carried through the
         merge records which entries came from A, so cooPerm can be split into the
         scatter maps used later by the MAT_REUSE_MATRIX path. */
      THRUSTINTARRAY32 Acoo(Annz);
      THRUSTINTARRAY32 Bcoo(Bnnz);
      THRUSTINTARRAY32 *roff;
      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
          ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Acusp->rowoffsets_gpu;
      } else roff = Acsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Acusp->handle,
                              roff->data().get(),
                              Annz,
                              m,
                              Acoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
          ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
        }
        roff = Bcusp->rowoffsets_gpu;
      } else roff = Bcsr->row_offsets;
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      stat = cusparseXcsr2coo(Bcusp->handle,
                              roff->data().get(),
                              Bnnz,
                              m,
                              Bcoo.data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      THRUSTINTARRAY32 Ccoo(c->nz);
      auto Aperm = thrust::make_constant_iterator(true);   /* tag: entry originates from A */
      auto Bperm = thrust::make_constant_iterator(false);  /* tag: entry originates from B */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
      thrust::device_vector<bool> wPerm(Annz+Bnnz);
      auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
      auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
      auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm));
      auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm));
      auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin()));
      auto p1 = Ccusp->cooPerm->begin();
      auto p2 = Ccusp->cooPerm->begin();
      thrust::advance(p2,Annz);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      thrust::merge(Azb,Aze,Bzb,Bze,Czb,IJCompare4());
      /* cooPerm[0..Annz) = positions of A's entries in C; cooPerm[Annz..) = positions of B's */
      thrust::partition_copy(thrust::make_counting_iterator(zero),thrust::make_counting_iterator(c->nz),wPerm.begin(),p1,p2,thrust::identity<bool>());
      stat = cusparseXcoo2csr(Ccusp->handle,
                              Ccoo.data().get(),
                              c->nz,
                              m,
                              Ccsr->row_offsets->data().get(),
                              CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        /* C^T = [A^T; B^T] is just the two transposed CSR matrices stacked: concatenate
           row offsets (B's shifted by nnz(A)), column indices, and values */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* last offset of A^T coincides with first (shifted) offset of B^T */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the device CSR pattern on the host so the SeqAIJ part of C is consistent */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: refresh only the values of C through the cached permutation */
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter A's values into C via cooPerm[0..Annz) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter B's values into C via cooPerm[Annz..) */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* transposed values are simply [A^T values ; B^T values] concatenated */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(*C);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4012