xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 9a14fc28d5b19bcf2bea9b2970694c7b45143db0)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
16e8d2b73aSMark Adams 
/*
  String tables handed to PetscOptionsEnum() (see MatSetFromOptions_SeqAIJCUSPARSE).
  In every table the value names come first, followed by the enum type name, the
  common prefix of the enum values, and a terminating 0.
*/
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR","ELL","HYB",
  "MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0
};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */

  /* SpMV algorithms, listed by integer value 0..3 */
  const char *const MatCUSPARSESpMVAlgorithms[] = {
    "MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2",
    "cusparseSpMVAlg_t","CUSPARSE_",0
  };
  /* SpMM algorithms, listed by integer value 0..6 (note CSR_ALG1 == 4 precedes COO_ALG4 == 5) */
  const char *const MatCUSPARSESpMMAlgorithms[] = {
    "ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2",
    "cusparseSpMMAlg_t","CUSPARSE_SPMM_",0
  };
  /* CSR->CSC conversion algorithms; slot 0 is a placeholder because the cuSPARSE enum starts at 1 */
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {
    "INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2",
    "cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0
  };
#endif
539ae82921SPaul Mullowney 
/* ---- Forward declarations of routines used before they are defined in this file ---- */

/* Cholesky / ICC factorization */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* LU / ILU factorization */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves, options, and matrix/vector operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destructors for the GPU-side data structures (MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded) */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Host <-> device transfers and transpose cache invalidation */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* COO assembly */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* Sub-array extraction */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91c215019aSStefano Zampini 
/*
  MatCUSPARSESetStream - Stores a CUDA stream in the matrix's CUSPARSE data and
  attaches that stream to the cuSPARSE handle held there.

  Input Parameters:
+  A      - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSE
-  stream - the CUDA stream to record and hand to cusparseSetStream()

  Errors with PETSC_ERR_COR when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!spdata) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  spdata->stream = stream;
  cstat = cusparseSetStream(spdata->handle,spdata->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
103b06137fdSPaul Mullowney 
/*
  MatCUSPARSESetHandle - Installs a cuSPARSE handle in the matrix's CUSPARSE data.

  Input Parameters:
+  A      - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSE
-  handle - the cuSPARSE handle to store

  Notes:
  If a different, non-NULL handle is already stored it is destroyed with
  cusparseDestroy() before being replaced. In all cases the stored handle is
  then switched to CUSPARSE_POINTER_MODE_DEVICE.

  Errors with PETSC_ERR_COR when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseHandle_t   previous;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!spdata) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  previous = spdata->handle;
  if (previous != handle) {
    /* release the handle that is about to be replaced, if there is one */
    if (previous) {cstat = cusparseDestroy(previous);CHKERRCUSPARSE(cstat);}
    spdata->handle = handle;
  }
  cstat = cusparseSetPointerMode(spdata->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
120b06137fdSPaul Mullowney 
/*
  MatCUSPARSEClearHandle - Forgets the cuSPARSE handle stored in the matrix.

  Input Parameter:
.  A - the matrix

  Notes:
  Silently does nothing when A is not of type MATSEQAIJCUSPARSE or has no spptr.
  The stored handle is only zeroed; cusparseDestroy() is not called here.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse && spdata) spdata->handle = 0;
  PetscFunctionReturn(0);
}
133b06137fdSPaul Mullowney 
/*
  MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver type.

  Input Parameter:
.  A - the factor matrix (not inspected)

  Output Parameter:
.  type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE; /* same answer for every matrix */
  PetscFunctionReturn(0);
}
1409ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1539ae82921SPaul Mullowney 
/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) factor matrix used by the
  "cusparse" solver for a SEQAIJCUSPARSE matrix.

  Input Parameters:
+  A     - the matrix to be factored
-  ftype - MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC

  Output Parameter:
.  B - the new n-by-n MATSEQAIJCUSPARSE factor matrix, with n = A->rmap->n

  Notes:
  The symbolic factorization callbacks are the CUSPARSE versions unless A is bound
  to the CPU, in which case the plain SeqAIJ versions are installed. B itself is
  bound to the CPU only when A is bound and A->bindingpropagates is set.
  Any other ftype raises PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  const PetscInt  n      = A->rmap->n;
  const PetscBool oncpu  = A->boundtocpu;
  Mat             F;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  F    = *B;
  ierr = MatSetSizes(F,n,n,n,n);CHKERRQ(ierr);
  F->factortype = ftype;
  ierr = MatSetType(F,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (oncpu && A->bindingpropagates) { ierr = MatBindToCPU(F,PETSC_TRUE);CHKERRQ(ierr); }

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(F,A,A);CHKERRQ(ierr);
    if (oncpu) {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    /* nested dissection for full LU, natural ordering for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (oncpu) {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    /* nested dissection for full Cholesky, natural ordering for ICC */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  F->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1959ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - Type-specific implementation behind MatCUSPARSESetFormat().

  Input Parameters:
+  A      - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSE
.  op     - MAT_CUSPARSE_MULT or MAT_CUSPARSE_ALL (both set the single stored format)
-  format - the storage format to record

  Errors:
  PETSC_ERR_COR when A->spptr is NULL (same check as MatCUSPARSESetStream() and
  MatCUSPARSESetHandle()); PETSC_ERR_SUP for any other op.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* fail with a PETSc error instead of dereferencing a NULL spptr */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    /* a sequential matrix keeps only one format, so both operations store it in the same field */
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
2139ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

   Notes:
   The call is forwarded to the method composed on A under the name "MatCUSPARSESetFormat_C".

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific implementation composed on the matrix */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
240e057df02SPaul Mullowney 
/*
  MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - Type-specific implementation behind MatCUSPARSESetUseCPUSolve().

  Input Parameters:
+  A       - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSE
-  use_cpu - value stored in the use_cpu_solve flag

  Errors:
  PETSC_ERR_COR when A->spptr is NULL (same check as MatCUSPARSESetStream() and
  MatCUSPARSESetHandle()).
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* fail with a PETSc error instead of dereferencing a NULL spptr */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
249365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Selects whether MatSolve is performed on the CPU.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).

   The call is forwarded to the method composed on A under the name "MatCUSPARSESetUseCPUSolve_C".

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific implementation composed on the matrix */
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
277365b711fSMark Adams 
/*
  MatSetOption_SeqAIJCUSPARSE - Option handler for SEQAIJCUSPARSE matrices.

  Input Parameters:
+  A   - the matrix
.  op  - the option being set
-  flg - the new value

  Notes:
  MAT_FORM_EXPLICIT_TRANSPOSE is handled here: the new value is stored in
  A->form_explicit_transpose, and when the option goes from true to false
  MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE) is called first.
  Every other option is passed on to MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    const PetscBool turning_off = (PetscBool)(A->form_explicit_transpose && !flg);

    if (turning_off) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
295e6e9a74fSStefano Zampini 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric (I)LU factorization for the cusparse solver.

  Input Parameters:
+  B    - the factor matrix (its row/column orderings are read from the SeqAIJ data)
.  A    - the matrix to factor
-  info - factorization options, passed through to MatLUFactorNumeric_SeqAIJ()

  Notes:
  The factors are computed by MatLUFactorNumeric_SeqAIJ() after
  MatSeqAIJCUSPARSECopyFromGPU(A), and B is marked PETSC_OFFLOAD_CPU.
  Unless use_cpu_solve is set, the CUSPARSE solve/solvetranspose callbacks are
  installed (the NaturalOrdering variants when both orderings are the identity)
  and MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B) is called. With use_cpu_solve
  set, the solve callbacks are left untouched. matsolve and matsolvetranspose
  are always cleared.

  NOTE(review): B->spptr is cast to Mat_SeqAIJCUSPARSE* here, while the symbolic
  factorization routines in this file cast the factor matrix's spptr to
  Mat_SeqAIJCUSPARSETriFactors*. Confirm use_cpu_solve is valid through this cast.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *fact   = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)B->spptr;
  IS                 rowperm = fact->row,colperm = fact->col;
  PetscBool          rowid,colid,gpusolve;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(rowperm,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colid);CHKERRQ(ierr);
  gpusolve = spdata->use_cpu_solve ? PETSC_FALSE : PETSC_TRUE;
  if (gpusolve) {
    if (rowid && colid) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* cleared regardless of ordering or solve location */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (gpusolve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
335bddcd29dSMark Adams 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - Processes the -mat_cusparse_* options.

  Input Parameters:
+  PetscOptionsObject - the options context (the PetscOptions* macros refer to it by this name)
-  A                  - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSE

  Notes:
  Options are only read for unfactored matrices (A->factortype == MAT_FACTOR_NONE):
    -mat_cusparse_mult_storage_format, -mat_cusparse_storage_format, -mat_cusparse_use_cpu_solve
  and, with CUDA >= 11.0,
    -mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg, -mat_cusparse_csr2csc_alg.
  For the algorithm options, the chosen value is the position of the name in the
  string table, so one representative cuSPARSE enum value per table is checked
  against its expected integer and PETSC_ERR_SUP is raised on mismatch.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)spdata->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);}

    /* storage format used for all operations */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)spdata->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);}

    /* CPU versus GPU triangular solve */
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",spdata->use_cpu_solve,&spdata->use_cpu_solve,&set);CHKERRQ(ierr);
    if (set) {ierr = MatCUSPARSESetUseCPUSolve(A,spdata->use_cpu_solve);CHKERRQ(ierr);}
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)spdata->spmvAlg,(PetscEnum*)&spdata->spmvAlg,&set);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (set && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    if (set && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)spdata->spmmAlg,(PetscEnum*)&spdata->spmmAlg,&set);CHKERRQ(ierr);
    if (set && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)spdata->csr2cscAlg,(PetscEnum*)&spdata->csr2cscAlg,&set);CHKERRQ(ierr);
    if (set && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3769ae82921SPaul Mullowney 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization for the cusparse solver.

  Resets the triangular-factor data hanging off B->spptr, delegates the symbolic
  work to MatILUFactorSymbolic_SeqAIJ(), and installs
  MatLUFactorNumeric_SeqAIJCUSPARSE as the numeric factorization routine of B.

  Input Parameters:
+  B     - the factor matrix
.  A     - the matrix to factor
.  isrow - row ordering
.  iscol - column ordering
-  info  - factorization options
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* discard any factor data left from a previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3889ae82921SPaul Mullowney 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU factorization for the cusparse solver.

  Resets the triangular-factor data hanging off B->spptr, delegates the symbolic
  work to MatLUFactorSymbolic_SeqAIJ(), and installs
  MatLUFactorNumeric_SeqAIJCUSPARSE as the numeric factorization routine of B.

  Input Parameters:
+  B     - the factor matrix
.  A     - the matrix to factor
.  isrow - row ordering
.  iscol - column ordering
-  info  - factorization options
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* discard any factor data left from a previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
4009ae82921SPaul Mullowney 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC factorization for the cusparse solver.

  Resets the triangular-factor data hanging off B->spptr, delegates the symbolic
  work to MatICCFactorSymbolic_SeqAIJ(), and installs
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE as the numeric factorization routine of B.

  Input Parameters:
+  B    - the factor matrix
.  A    - the matrix to factor
.  perm - ordering
-  info - factorization options
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* discard any factor data left from a previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
412087f3262SPaul Mullowney 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky factorization for the cusparse solver.

  Resets the triangular-factor data hanging off B->spptr, delegates the symbolic
  work to MatCholeskyFactorSymbolic_SeqAIJ(), and installs
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE as the numeric factorization routine of B.

  Input Parameters:
+  B    - the factor matrix
.  A    - the matrix to factor
.  perm - ordering
-  info - factorization options
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  /* discard any factor data left from a previous factorization */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
424087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds, or refreshes the values of, the lower
  triangular ILU factor kept on the device for the factored matrix A.

  Returns immediately when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  If a lower factor already exists in A->spptr, only the values are repacked into the host
  buffer AA_h (allocated with cudaMallocHost() on demand) and copied into the device value array.

  Otherwise the factor is created from scratch:
    - rows 1..n-1 of the host arrays a->i / a->j / a->a are repacked into CSR arrays in which every
      row is followed by an explicit 1.0 on the diagonal (row 0 holds only that diagonal entry);
    - a cuSPARSE matrix descriptor is created (zero based, lower fill mode, unit diagonal);
    - the CSR arrays are copied into thrust device arrays;
    - the cuSPARSE triangular solve analysis is performed, bracketed by the
      MAT_CUSPARSESolveAnalysis log event;
    - the new factor is stored in A->spptr and the host value buffer is retained in AA_h.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  const PetscInt                    nrows    = A->rmap->n;
  const PetscInt                    *rowptr  = aij->i,*colidx = aij->j;
  const MatScalar                   *vals    = aij->a;
  PetscInt                          row,len,nnzL,src,dst;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  cusparseStatus_t                  stat;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* only (re)build when the offload mask is UNALLOCATED or CPU */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* nonzeros of the lower factor, counting the 1's placed on the diagonal */
    nnzL = nrows + rowptr[nrows] - rowptr[1];

    if (lower) {
      /* the factor already exists: update values only */
      if (!lower->AA_h) {
        cerr = cudaMallocHost((void**) &lower->AA_h, nnzL*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }

      /* src walks the host values, dst walks the packed buffer */
      lower->AA_h[0] = 1.0;
      src            = 0;
      dst            = 1;
      for (row=1; row<nrows; row++) {
        len  = rowptr[row+1] - rowptr[row];
        ierr = PetscArraycpy(&(lower->AA_h[dst]), vals+src, len);CHKERRQ(ierr);
        src += len;
        dst += len;
        /* unit diagonal closes the row */
        lower->AA_h[dst] = 1.0;
        dst += 1;
      }
      lower->csrMat->values->assign(lower->AA_h, lower->AA_h+nnzL);
      ierr = PetscLogCpuToGpu(nnzL*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      PetscScalar *hostVals;
      PetscInt    *hostRowPtr,*hostColIdx;
      CsrMatrix   *csr;

      /* host staging buffers for the CSR arrays */
      cerr = cudaMallocHost((void**) &hostVals, nnzL*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &hostRowPtr, (nrows+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &hostColIdx, nnzL*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* row 0 consists of its diagonal entry alone */
      hostRowPtr[0]     = (PetscInt) 0;
      hostRowPtr[nrows] = nnzL;
      hostColIdx[0]     = (PetscInt) 0;
      hostVals[0]       = (MatScalar) 1.0;

      /* every following row: copied off-diagonals, then the extra diagonal term */
      src = 0;
      dst = 1;
      for (row=1; row<nrows; row++) {
        len             = rowptr[row+1] - rowptr[row];
        hostRowPtr[row] = dst;

        ierr = PetscArraycpy(&(hostColIdx[dst]), colidx+src, len);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(hostVals[dst]), vals+src, len);CHKERRQ(ierr);
        src += len;
        dst += len;

        hostColIdx[dst] = (PetscInt) row;
        hostVals[dst]   = (MatScalar) 1.0;
        dst            += 1;
      }

      /* new triangular factor record */
      ierr = PetscNew(&lower);CHKERRQ(ierr);
      lower->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&lower->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(lower->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(lower->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(lower->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(lower->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(lower->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* the factor is solved without transposition */
      lower->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR storage, filled from the host staging buffers */
      csr              = new CsrMatrix;
      lower->csrMat    = csr;
      csr->num_rows    = nrows;
      csr->num_cols    = nrows;
      csr->num_entries = nnzL;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(hostRowPtr, hostRowPtr+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnzL);
      csr->column_indices->assign(hostColIdx, hostColIdx+nnzL);

      csr->values = new THRUSTARRAY(nnzL);
      csr->values->assign(hostVals, hostVals+nnzL);

      /* solve analysis: create the info object, then (CUDA >= 9) size and allocate the work buffer */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&lower->solveInfo);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_get_svbuffsize(factors->handle, lower->solveOp,
                                     csr->num_rows, csr->num_entries, lower->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), lower->solveInfo,
                                     &lower->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&lower->solveBuffer,lower->solveBufferSize);CHKERRCUDA(cerr);
      stat = cusparse_analysis(factors->handle, lower->solveOp,
                               csr->num_rows, csr->num_entries, lower->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), lower->solveInfo,
                               lower->solvePolicy, lower->solveBuffer);CHKERRCUSPARSE(stat);
     #else
      stat = cusparse_analysis(factors->handle, lower->solveOp,
                               csr->num_rows, csr->num_entries, lower->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), lower->solveInfo);CHKERRCUSPARSE(stat);
     #endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the value buffer is kept in AA_h, the index buffers are released */
      factors->loTriFactorPtr = lower;
      lower->AA_h             = hostVals;
      cerr = cudaFreeHost(hostRowPtr);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(hostColIdx);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((nrows+1+nnzL)*sizeof(int)+nnzL*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
5719ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - builds, or refreshes the values of, the upper
  triangular ILU factor kept on the device for the factored matrix A.

  Returns immediately when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  The host data is addressed through a->diag: for row i the entries start at adiag[i+1]+1, the
  first adiag[i]-adiag[i+1]-1 of them are copied as off-diagonals, and the reciprocal of the
  entry that follows them is written to the diagonal position, which comes first in the packed row.
  Rows are processed from the last to the first while the write position moves backwards from
  the total entry count.

  If an upper factor already exists in A->spptr, only the values are repacked into the host
  buffer AA_h (allocated with cudaMallocHost() on demand) and copied into the device value array.

  Otherwise the CSR arrays are built, a cuSPARSE descriptor is created (zero based, upper fill
  mode, non-unit diagonal), the arrays are copied into thrust device arrays, the triangular solve
  analysis is performed inside the MAT_CUSPARSESolveAnalysis log event, and the factor is stored
  in A->spptr with the host value buffer retained in AA_h.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  const PetscInt                    nrows    = A->rmap->n;
  const PetscInt                    *colidx  = aij->j,*diag = aij->diag;
  const MatScalar                   *vals    = aij->a;
  const PetscInt                    *rowcols;
  const MatScalar                   *rowvals;
  PetscInt                          row,len,nnzU,dst;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  cusparseStatus_t                  stat;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* only (re)build when the offload mask is UNALLOCATED or CPU */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* nonzeros of the upper factor */
    nnzU = diag[0]-diag[nrows];

    if (upper) {
      /* the factor already exists: update values only */
      if (!upper->AA_h) {
        cerr = cudaMallocHost((void**) &upper->AA_h, nnzU*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }

      dst = nnzU;
      for (row=nrows-1; row>=0; row--) {
        rowvals = vals + diag[row+1] + 1;

        /* entries of this row that are not the diagonal */
        len = diag[row] - diag[row+1]-1;

        /* step back to the start of the packed row */
        dst -= (len+1);

        /* diagonal slot first, then the off-diagonals */
        upper->AA_h[dst] = 1./rowvals[len];
        ierr = PetscArraycpy(&(upper->AA_h[dst+1]), rowvals, len);CHKERRQ(ierr);
      }
      upper->csrMat->values->assign(upper->AA_h, upper->AA_h+nnzU);
      ierr = PetscLogCpuToGpu(nnzU*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      PetscScalar *hostVals;
      PetscInt    *hostRowPtr,*hostColIdx;
      CsrMatrix   *csr;

      /* host staging buffers for the CSR arrays */
      cerr = cudaMallocHost((void**) &hostVals, nnzU*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &hostRowPtr, (nrows+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &hostColIdx, nnzU*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* pack the rows back to front; dst always equals the row pointer of the row being written */
      hostRowPtr[0]     = (PetscInt) 0;
      hostRowPtr[nrows] = nnzU;
      dst               = nnzU;
      for (row=nrows-1; row>=0; row--) {
        rowvals = vals   + diag[row+1] + 1;
        rowcols = colidx + diag[row+1] + 1;

        /* entries of this row that are not the diagonal */
        len = diag[row] - diag[row+1]-1;

        /* step back to the start of the packed row */
        dst -= (len+1);

        /* diagonal slot first */
        hostColIdx[dst] = (PetscInt) row;
        hostVals[dst]   = (MatScalar)1./rowvals[len];
        hostRowPtr[row] = dst;

        /* then the off-diagonals */
        ierr = PetscArraycpy(&(hostColIdx[dst+1]), rowcols, len);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(hostVals[dst+1]), rowvals, len);CHKERRQ(ierr);
      }

      /* new triangular factor record */
      ierr = PetscNew(&upper);CHKERRQ(ierr);
      upper->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upper->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upper->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upper->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upper->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* the factor is solved without transposition */
      upper->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR storage, filled from the host staging buffers */
      csr              = new CsrMatrix;
      upper->csrMat    = csr;
      csr->num_rows    = nrows;
      csr->num_cols    = nrows;
      csr->num_entries = nnzU;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(hostRowPtr, hostRowPtr+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnzU);
      csr->column_indices->assign(hostColIdx, hostColIdx+nnzU);

      csr->values = new THRUSTARRAY(nnzU);
      csr->values->assign(hostVals, hostVals+nnzU);

      /* solve analysis: create the info object, then (CUDA >= 9) size and allocate the work buffer */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&upper->solveInfo);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_get_svbuffsize(factors->handle, upper->solveOp,
                                     csr->num_rows, csr->num_entries, upper->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), upper->solveInfo,
                                     &upper->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upper->solveBuffer,upper->solveBufferSize);CHKERRCUDA(cerr);
      stat = cusparse_analysis(factors->handle, upper->solveOp,
                               csr->num_rows, csr->num_entries, upper->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upper->solveInfo,
                               upper->solvePolicy, upper->solveBuffer);CHKERRCUSPARSE(stat);
     #else
      stat = cusparse_analysis(factors->handle, upper->solveOp,
                               csr->num_rows, csr->num_entries, upper->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upper->solveInfo);CHKERRCUSPARSE(stat);
     #endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the value buffer is kept in AA_h, the index buffers are released */
      factors->upTriFactorPtr = upper;
      upper->AA_h             = hostVals;
      cerr = cudaFreeHost(hostRowPtr);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(hostColIdx);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((nrows+1+nnzU)*sizeof(int)+nnzU*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
7179ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - prepares the ILU factors of A for use on the device.

  Steps, in order:
    1. fail with PETSC_ERR_COR if A->spptr holds no triangular-factor container;
    2. build (or refresh) the lower and the upper triangular factors;
    3. create the work vector of length n if it does not exist yet and record a->nz as nnz;
    4. set A->offloadmask to PETSC_OFFLOAD_BOTH;
    5. for the row permutation a->row and the column permutation a->icol: if the index set is not
       the identity and no device copy exists yet, copy its indices into a new device array.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row,colperm = aij->icol;
  const PetscInt               nrows    = A->rmap->n;
  const PetscInt               *idx;
  PetscBool                    is_identity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* triangular factors: lower first, then upper */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* the work vector is created once and reused afterwards */
  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(nrows);
  }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: copied once, and only when it is not the identity */
  ierr = ISIdentity(rowperm,&is_identity);CHKERRQ(ierr);
  if (!(is_identity || factors->rpermIndices)) {
    ierr = ISGetIndices(rowperm,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(rowperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same treatment */
  ierr = ISIdentity(colperm,&is_identity);CHKERRQ(ierr);
  if (!(is_identity || factors->cpermIndices)) {
    ierr = ISGetIndices(colperm,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(colperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
7619ae82921SPaul Mullowney 
762087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
763087f3262SPaul Mullowney {
764087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
765087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
766aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
767aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
768087f3262SPaul Mullowney   cusparseStatus_t                  stat;
769087f3262SPaul Mullowney   PetscErrorCode                    ierr;
77057d48284SJunchao Zhang   cudaError_t                       cerr;
771087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
772087f3262SPaul Mullowney   PetscScalar                       *AAUp;
773087f3262SPaul Mullowney   PetscScalar                       *AALo;
774087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
775087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
776087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
777087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
778087f3262SPaul Mullowney 
779087f3262SPaul Mullowney   PetscFunctionBegin;
780cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
781c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
782087f3262SPaul Mullowney     try {
783da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
784da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
785da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
786087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
78757d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
78857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
789087f3262SPaul Mullowney 
790087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
791087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
792087f3262SPaul Mullowney         AiUp[n]=nzUpper;
793087f3262SPaul Mullowney         offset = 0;
794087f3262SPaul Mullowney         for (i=0; i<n; i++) {
795087f3262SPaul Mullowney           /* set the pointers */
796087f3262SPaul Mullowney           v  = aa + ai[i];
797087f3262SPaul Mullowney           vj = aj + ai[i];
798087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
799087f3262SPaul Mullowney 
800087f3262SPaul Mullowney           /* first, set the diagonal elements */
801087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
80209f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
803087f3262SPaul Mullowney           AiUp[i]      = offset;
80409f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
805087f3262SPaul Mullowney 
806087f3262SPaul Mullowney           offset+=1;
807087f3262SPaul Mullowney           if (nz>0) {
808f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
809580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
810087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
811087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
812087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
813087f3262SPaul Mullowney             }
814087f3262SPaul Mullowney             offset+=nz;
815087f3262SPaul Mullowney           }
816087f3262SPaul Mullowney         }
817087f3262SPaul Mullowney 
818aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
819da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
820da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
821087f3262SPaul Mullowney 
822aa372e3fSPaul Mullowney         /* Create the matrix description */
82357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
82457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8251b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
826afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
827afb2bd1cSJunchao Zhang        #else
82857d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
829afb2bd1cSJunchao Zhang        #endif
83057d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
83157d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
832087f3262SPaul Mullowney 
833aa372e3fSPaul Mullowney         /* set the matrix */
834aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
835aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
836aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
837aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
838aa372e3fSPaul Mullowney 
839aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
841aa372e3fSPaul Mullowney 
842aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
847aa372e3fSPaul Mullowney 
848afb2bd1cSJunchao Zhang         /* set the operation */
849afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
850afb2bd1cSJunchao Zhang 
851afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
852da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
853afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8541b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
855afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
856afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
857afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
858afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
859afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
860afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
861afb2bd1cSJunchao Zhang       #endif
862afb2bd1cSJunchao Zhang 
863aa372e3fSPaul Mullowney         /* perform the solve analysis */
864aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
865aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
866aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
867d49cd2b7SBarry Smith                                  upTriFactor->csrMat->column_indices->data().get(),
8681b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
869d49cd2b7SBarry Smith                                  upTriFactor->solveInfo,
870d49cd2b7SBarry Smith                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
871d49cd2b7SBarry Smith                                 #else
872d49cd2b7SBarry Smith                                   upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
873afb2bd1cSJunchao Zhang                                 #endif
874da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
875da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
876aa372e3fSPaul Mullowney 
877da79fbbcSStefano Zampini         /* assign the pointer */
878aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
879aa372e3fSPaul Mullowney 
880aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
881da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
882da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
883aa372e3fSPaul Mullowney 
884aa372e3fSPaul Mullowney         /* Create the matrix description */
88557d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
88657d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8871b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
888afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
889afb2bd1cSJunchao Zhang        #else
89057d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
891afb2bd1cSJunchao Zhang        #endif
89257d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
89357d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
894aa372e3fSPaul Mullowney 
895aa372e3fSPaul Mullowney         /* set the operation */
896aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
897aa372e3fSPaul Mullowney 
898aa372e3fSPaul Mullowney         /* set the matrix */
899aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
900aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
901aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
902aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
903aa372e3fSPaul Mullowney 
904aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
905aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
906aa372e3fSPaul Mullowney 
907aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
908aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
909aa372e3fSPaul Mullowney 
910aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
911aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
912aa372e3fSPaul Mullowney 
913afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
914da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
915afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
9161b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
917afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
918afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
919afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
920afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
921afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
922afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
923afb2bd1cSJunchao Zhang       #endif
924afb2bd1cSJunchao Zhang 
925aa372e3fSPaul Mullowney         /* perform the solve analysis */
926aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
927aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
928aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
929d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
9301b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
931d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
932d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
933d49cd2b7SBarry Smith                                 #else
934d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
935afb2bd1cSJunchao Zhang                                 #endif
936da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
937da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
938aa372e3fSPaul Mullowney 
939da79fbbcSStefano Zampini         /* assign the pointer */
940aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
941087f3262SPaul Mullowney 
942da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
94357d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
94457d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
945da79fbbcSStefano Zampini       } else {
946da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
947da79fbbcSStefano Zampini         offset = 0;
948da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
949da79fbbcSStefano Zampini           /* set the pointers */
950da79fbbcSStefano Zampini           v  = aa + ai[i];
951da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
952da79fbbcSStefano Zampini 
953da79fbbcSStefano Zampini           /* first, set the diagonal elements */
954da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
955da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
956da79fbbcSStefano Zampini 
957da79fbbcSStefano Zampini           offset+=1;
958da79fbbcSStefano Zampini           if (nz>0) {
959da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
960da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
961da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
962da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
963da79fbbcSStefano Zampini             }
964da79fbbcSStefano Zampini             offset+=nz;
965da79fbbcSStefano Zampini           }
966da79fbbcSStefano Zampini         }
967da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
968da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
969da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
970da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
971da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
972da79fbbcSStefano Zampini       }
97357d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
97457d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
975087f3262SPaul Mullowney     } catch(char *ex) {
976087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
977087f3262SPaul Mullowney     }
978087f3262SPaul Mullowney   }
979087f3262SPaul Mullowney   PetscFunctionReturn(0);
980087f3262SPaul Mullowney }
981087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Pushes the ICC/Cholesky triangular factors of A to the GPU
  and prepares the auxiliary data used by the triangular solves.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Builds (or refreshes) the triangular factors through MatSeqAIJCUSPARSEBuildICCTriMatrices(), lazily
  creates the work vector of length n, records the nonzero count of the factorization, and marks A as
  valid on both CPU and GPU.

  When the ordering a->row is not the identity, the permutation and its inverse are copied into
  rpermIndices and cpermIndices. These arrays are allocated only the first time; later calls overwrite
  the existing arrays in place, so repeated numeric factorizations do not leak the previous allocations.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* a->nz counts one triangle including the diagonal; count both triangles, diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices, needed only for a non-natural ordering */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* reuse existing device arrays; assign() replaces their contents */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1019087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization for SEQAIJCUSPARSE matrices.

  Input Parameters:
+ B    - the factor matrix to fill
. A    - the matrix being factored
- info - factorization options, forwarded to the CPU routine

  Notes:
  The values of A are first brought back from the GPU and the factorization itself is carried out by
  MatCholeskyFactorNumeric_SeqAIJ(). The solve callbacks of B are then chosen according to whether
  the ordering stored in B is the identity, and the triangular factors are copied to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact    = (Mat_SeqAIJ*)B->data;
  IS             ordering = fact->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* factor on the host using up-to-date values of A */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variants matching the ordering */
  ierr = ISIdentity(ordering,&natural);CHKERRQ(ierr);
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  /* no multi-right-hand-side solves in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* move the triangular factors to the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
10499ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private - Builds the explicit transpose (CSC form) of one
  triangular factor on the GPU and runs the cuSPARSE solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix (used only as the object for event logging)
. cusparseTriFactors - the triangular-factor container providing the cuSPARSE handle
- triFactor          - the factor to transpose; its csr2csc scratch buffer fields are (re)set on CUDA >= 11

  Output Parameter:
. triFactorTOut - the newly allocated transposed factor

  Notes:
  The transposed factor copies the matrix type, index base and diagonal type of the input descriptor,
  uses the opposite fill mode, and is always solved with CUSPARSE_OPERATION_NON_TRANSPOSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTOut)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the source factor; transposition flips the fill mode */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  /* the scratch buffer is stored on the source factor, not on the transposed one */
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above (the previous code called PetscLogEventBegin() a second time here) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           triFactorT->solveInfo,
                           triFactorT->solvePolicy, triFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           triFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorTOut = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Creates the transposes of the lower and upper triangular
  factors of A on the GPU and analyzes them for use in triangular solves.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors with both factors set

  Notes:
  On success the results are stored in loTriFactorPtrTranspose and upTriFactorPtrTranspose of the
  triangular-factor container. The lower factor is processed first, then the upper one.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor,*upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = NULL;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = NULL;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing lower triangular factor");
  if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upper triangular factor");

  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1255bda325fcSPaul Mullowney 
/*
  PetscScalarToPetscInt - unary functor converting a PetscScalar to a PetscInt.

  The real part of the scalar is taken with PetscRealPart() and cast to PetscInt; any imaginary
  part is discarded by PetscRealPart(). The functor carries no state, so operator() is const and
  may be invoked on const or temporary functor objects. It is callable from host and device code
  and is used in this file with thrust::transform().
*/
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s) const
  {
    return (PetscInt)PetscRealPart(s);
  }
};
1264a49f1ed0SStefano Zampini 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Creates, or refreshes the numerical values of, the explicit
  transpose of A that is kept on the GPU in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; its GPU copy is brought up to date with MatSeqAIJCUSPARSECopyToGPU() first

  Notes:
  Returns immediately when A->transupdated is already set (and errors if that flag is set but no
  transpose struct exists).

  For storage formats other than MAT_CUSPARSE_CSR any existing transpose is invalidated and rebuilt.
  ELL/HYB are rejected with PETSC_ERR_SUP when built against CUDA >= 11.0; for older CUDA the
  transpose is obtained through HYB -> CSR -> CSC -> HYB conversions.

  For MAT_CUSPARSE_CSR the transpose struct is created once. On the first call (when
  cusparsestruct->csr2csc_i is absent) csr2csc is applied to the sequence 0,1,2,... stored as scalars,
  and the permuted sequence is converted to integers and cached in csr2csc_i. Every call then fills
  the transpose values with a thrust gather through that permutation.

  On success matstructT->cprowIndices is set to NULL, the struct is stored in matTranspose and
  A->transupdated is set to PETSC_TRUE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  /* nothing to do if the transpose is already current */
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* only the CSR format updates an existing transpose in place; other formats start from scratch */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    /* the transpose uses the same index base as the original matrix */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants 1, 0 and 1 */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR arrays of the transpose: rows and columns of A swap roles */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* device copy of the host row offsets a->i, used later as input to csr2csc */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         Read as CSR, the result has A->cmap->n rows and A->rmap->n columns, so csr2csc writes
         A->cmap->n+1 offsets into tempT->row_offsets; size and describe tempT accordingly
         (same convention as the CSR path above). */
      tempT->num_rows = A->cmap->n;
      tempT->num_cols = A->rmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB; the matrix handed to csr2hyb is the transpose, hence the swapped sizes */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR -> CSC permutation once: transpose the "values" 0,1,2,...; what ends up in
         matrixT->values is, for each entry of the transpose, the position of its source entry in A */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* the CUDA-11 csr2cscEx2 interface needs a caller-provided work buffer */
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* cache the permutation as integers; matrixT->values is overwritten with real data below */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather: matrixT->values[k] = matrix->values[csr2csc_i[k]] */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1494bda325fcSPaul Mullowney 
1495a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve on the GPU using the transposed triangular factors
  stored in A->spptr, with row and column permutations applied.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are built on first use with MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
  The steps are: gather bb through rpermIndices into xx, solve with the transposed upper factor into
  the work vector, solve with the transposed lower factor back into xx, then gather xx through
  cpermIndices (via the work vector, since the gather cannot be done in place).

  The thrust calls are wrapped in PetscStackCallThrust(), as elsewhere in this file, so that a thrust
  failure is reported as a PETSc error instead of escaping as a C++ exception.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: xGPU[i] = bGPU[rpermIndices[i]] */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* Next, solve U: reads xarray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1583bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transpose solve on the GPU using the transposed
  triangular factors stored in A->spptr, without permuting the right-hand side or the solution.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are built on first use with MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
  The transposed upper factor is applied to bb with the result stored in the work vector; the
  transposed lower factor is then applied to the work vector with the result stored in xx.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  cstat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Lazily run the transpose analysis when neither transposed factor exists yet, then re-read both */
  if (!lowerT && !upperT) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays: xx for writing, bb for reading */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: upper factor, rhs -> work */
  cstat = cusparse_solve(factors->handle, upperT->solveOp,
                         upperT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         upperT->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, upperT->descr,
                         upperT->csrMat->values->data().get(),
                         upperT->csrMat->row_offsets->data().get(),
                         upperT->csrMat->column_indices->data().get(),
                         upperT->solveInfo,
                         rhs,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         work->data().get(),
                         upperT->solvePolicy, upperT->solveBuffer);CHKERRCUSPARSE(cstat);
                       #else
                         work->data().get());CHKERRCUSPARSE(cstat);
                       #endif

  /* Step 2: lower factor, work -> sol */
  cstat = cusparse_solve(factors->handle, lowerT->solveOp,
                         lowerT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         lowerT->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, lowerT->descr,
                         lowerT->csrMat->values->data().get(),
                         lowerT->csrMat->row_offsets->data().get(),
                         lowerT->csrMat->column_indices->data().get(),
                         lowerT->solveInfo,
                         work->data().get(),
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         sol,
                         lowerT->solvePolicy, lowerT->solveBuffer);CHKERRCUSPARSE(cstat);
                       #else
                         sol);CHKERRCUSPARSE(cstat);
                       #endif

  /* Hand the arrays back and close the timing/flop logging */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1653bda325fcSPaul Mullowney 
16546fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
16559ae82921SPaul Mullowney {
1656465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1657465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1658465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1659465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
16609ae82921SPaul Mullowney   cusparseStatus_t                      stat;
16619ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1662aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1663aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1664aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1665b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
16669ae82921SPaul Mullowney 
16679ae82921SPaul Mullowney   PetscFunctionBegin;
1668ebc8f436SDominic Meiser 
1669e057df02SPaul Mullowney   /* Get the GPU pointers */
1670c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1671c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1672c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1673c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16749ae82921SPaul Mullowney 
16757a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1676aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1677a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1678c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
16794e4bbfaaSStefano Zampini                tempGPU->begin());
1680aa372e3fSPaul Mullowney 
1681aa372e3fSPaul Mullowney   /* Next, solve L */
1682aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1683afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
16841b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1685afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1686afb2bd1cSJunchao Zhang                       #endif
1687afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1688aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1689aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1690aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1691aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1692d49cd2b7SBarry Smith                         tempGPU->data().get(),
16931b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1694d49cd2b7SBarry Smith                          xarray,
1695d49cd2b7SBarry Smith                          loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1696d49cd2b7SBarry Smith                       #else
1697d49cd2b7SBarry Smith                          xarray);CHKERRCUSPARSE(stat);
1698afb2bd1cSJunchao Zhang                       #endif
1699aa372e3fSPaul Mullowney 
1700aa372e3fSPaul Mullowney   /* Then, solve U */
1701aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1702afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
17031b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1704afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1705afb2bd1cSJunchao Zhang                       #endif
1706afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1707aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1708aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1709aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1710d49cd2b7SBarry Smith                         upTriFactor->solveInfo,xarray,
17111b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1712d49cd2b7SBarry Smith                         tempGPU->data().get(),
1713d49cd2b7SBarry Smith                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1714d49cd2b7SBarry Smith                       #else
1715d49cd2b7SBarry Smith                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1716afb2bd1cSJunchao Zhang                       #endif
1717d49cd2b7SBarry Smith 
17184e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
1719a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
17204e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
17214e4bbfaaSStefano Zampini                xGPU);
17229ae82921SPaul Mullowney 
1723c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1724c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1725661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1726958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
17279ae82921SPaul Mullowney   PetscFunctionReturn(0);
17289ae82921SPaul Mullowney }
17299ae82921SPaul Mullowney 
17306fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
17319ae82921SPaul Mullowney {
1732465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1733465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
17349ae82921SPaul Mullowney   cusparseStatus_t                  stat;
17359ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1736aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1737aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1738aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1739b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
17409ae82921SPaul Mullowney 
17419ae82921SPaul Mullowney   PetscFunctionBegin;
1742e057df02SPaul Mullowney   /* Get the GPU pointers */
1743c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1744c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
17459ae82921SPaul Mullowney 
17467a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1747aa372e3fSPaul Mullowney   /* First, solve L */
1748aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1749afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
17501b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1751afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1752afb2bd1cSJunchao Zhang                       #endif
1753afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1754aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1755aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1756aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1757aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1758d49cd2b7SBarry Smith                         barray,
17591b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1760d49cd2b7SBarry Smith                         tempGPU->data().get(),
1761d49cd2b7SBarry Smith                         loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1762d49cd2b7SBarry Smith                       #else
1763d49cd2b7SBarry Smith                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1764afb2bd1cSJunchao Zhang                       #endif
1765d49cd2b7SBarry Smith 
1766aa372e3fSPaul Mullowney   /* Next, solve U */
1767aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1768afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
17691b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1770afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1771afb2bd1cSJunchao Zhang                       #endif
1772afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1773aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1774aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1775aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1776aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1777d49cd2b7SBarry Smith                         tempGPU->data().get(),
17781b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1779d49cd2b7SBarry Smith                         xarray,
1780d49cd2b7SBarry Smith                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1781d49cd2b7SBarry Smith                       #else
1782d49cd2b7SBarry Smith                         xarray);CHKERRCUSPARSE(stat);
1783afb2bd1cSJunchao Zhang                       #endif
17849ae82921SPaul Mullowney 
1785c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1786c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1787661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1788958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
17899ae82921SPaul Mullowney   PetscFunctionReturn(0);
17909ae82921SPaul Mullowney }
17919ae82921SPaul Mullowney 
17927e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
17937e8381f9SStefano Zampini {
17947e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
17957e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
17967e8381f9SStefano Zampini   cudaError_t        cerr;
17977e8381f9SStefano Zampini   PetscErrorCode     ierr;
17987e8381f9SStefano Zampini 
17997e8381f9SStefano Zampini   PetscFunctionBegin;
18007e8381f9SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
18017e8381f9SStefano Zampini     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
18027e8381f9SStefano Zampini 
18037e8381f9SStefano Zampini     ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
18047e8381f9SStefano Zampini     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
18057e8381f9SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
18067e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
18077e8381f9SStefano Zampini     ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
18087e8381f9SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
18097e8381f9SStefano Zampini   }
18107e8381f9SStefano Zampini   PetscFunctionReturn(0);
18117e8381f9SStefano Zampini }
18127e8381f9SStefano Zampini 
18137e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
18147e8381f9SStefano Zampini {
18157e8381f9SStefano Zampini   PetscErrorCode ierr;
18167e8381f9SStefano Zampini 
18177e8381f9SStefano Zampini   PetscFunctionBegin;
18187e8381f9SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
181967a45760SJunchao Zhang   *array = ((Mat_SeqAIJ*)A->data)->a;
182067a45760SJunchao Zhang   PetscFunctionReturn(0);
182167a45760SJunchao Zhang }
182267a45760SJunchao Zhang 
182367a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
182467a45760SJunchao Zhang {
182567a45760SJunchao Zhang   PetscFunctionBegin;
18267e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
182767a45760SJunchao Zhang   *array         = NULL;
182867a45760SJunchao Zhang   PetscFunctionReturn(0);
182967a45760SJunchao Zhang }
183067a45760SJunchao Zhang 
183167a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
183267a45760SJunchao Zhang {
183367a45760SJunchao Zhang   PetscErrorCode ierr;
183467a45760SJunchao Zhang 
183567a45760SJunchao Zhang   PetscFunctionBegin;
183667a45760SJunchao Zhang   ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
183767a45760SJunchao Zhang   *array = ((Mat_SeqAIJ*)A->data)->a;
183867a45760SJunchao Zhang   PetscFunctionReturn(0);
183967a45760SJunchao Zhang }
184067a45760SJunchao Zhang 
184167a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
184267a45760SJunchao Zhang {
184367a45760SJunchao Zhang   PetscFunctionBegin;
184467a45760SJunchao Zhang   *array = NULL;
184567a45760SJunchao Zhang   PetscFunctionReturn(0);
184667a45760SJunchao Zhang }
184767a45760SJunchao Zhang 
184867a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
184967a45760SJunchao Zhang {
185067a45760SJunchao Zhang   PetscFunctionBegin;
185167a45760SJunchao Zhang   *array = ((Mat_SeqAIJ*)A->data)->a;
185267a45760SJunchao Zhang   PetscFunctionReturn(0);
185367a45760SJunchao Zhang }
185467a45760SJunchao Zhang 
185567a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
185667a45760SJunchao Zhang {
185767a45760SJunchao Zhang   PetscFunctionBegin;
185867a45760SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
185967a45760SJunchao Zhang   *array         = NULL;
18607e8381f9SStefano Zampini   PetscFunctionReturn(0);
18617e8381f9SStefano Zampini }
18627e8381f9SStefano Zampini 
1863042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
18649ae82921SPaul Mullowney {
1865aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
18667c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
18679ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1868213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
18699ae82921SPaul Mullowney   PetscErrorCode               ierr;
1870aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1871abb89eb1SStefano Zampini   PetscBool                    both = PETSC_TRUE;
1872b06137fdSPaul Mullowney   cudaError_t                  err;
18739ae82921SPaul Mullowney 
18749ae82921SPaul Mullowney   PetscFunctionBegin;
1875e8d2b73aSMark Adams   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
1876c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1877a49f1ed0SStefano Zampini     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1878a49f1ed0SStefano Zampini       CsrMatrix *matrix;
1879afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
188085ba7357SStefano Zampini 
1881e8d2b73aSMark Adams       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
188285ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1883afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
188405035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
18854863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
188685ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1887a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
188834d6c7a5SJose E. Roman     } else {
1889abb89eb1SStefano Zampini       PetscInt nnz;
189085ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
18917c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1892a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
18937c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
189481902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
1895a49f1ed0SStefano Zampini       cusparsestruct->workVector = NULL;
1896a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = NULL;
18979ae82921SPaul Mullowney       try {
18989ae82921SPaul Mullowney         if (a->compressedrow.use) {
18999ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
19009ae82921SPaul Mullowney           ii   = a->compressedrow.i;
19019ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
19029ae82921SPaul Mullowney         } else {
1903213423ffSJunchao Zhang           m    = A->rmap->n;
1904213423ffSJunchao Zhang           ii   = a->i;
1905e6e9a74fSStefano Zampini           ridx = NULL;
19069ae82921SPaul Mullowney         }
1907e8d2b73aSMark Adams         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
1908e8d2b73aSMark Adams         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
1909abb89eb1SStefano Zampini         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1910abb89eb1SStefano Zampini         else nnz = a->nz;
19119ae82921SPaul Mullowney 
191285ba7357SStefano Zampini         /* create cusparse matrix */
1913abb89eb1SStefano Zampini         cusparsestruct->nrows = m;
1914aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
191557d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
191657d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
191757d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
19189ae82921SPaul Mullowney 
1919afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
19207656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
19217656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1922afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
19237656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
19247656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
192557d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1926b06137fdSPaul Mullowney 
1927aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1928aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1929aa372e3fSPaul Mullowney           /* set the matrix */
1930afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1931afb2bd1cSJunchao Zhang           mat->num_rows = m;
1932afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1933abb89eb1SStefano Zampini           mat->num_entries = nnz;
1934afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1935afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
19369ae82921SPaul Mullowney 
1937abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1938abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1939aa372e3fSPaul Mullowney 
1940abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1941abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1942aa372e3fSPaul Mullowney 
1943aa372e3fSPaul Mullowney           /* assign the pointer */
1944afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1945afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1946afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1947afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1948afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1949afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1950afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1951afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1952afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1953afb2bd1cSJunchao Zhang           }
1954afb2bd1cSJunchao Zhang          #endif
1955aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1956afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1957afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1958afb2bd1cSJunchao Zhang          #else
1959afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1960afb2bd1cSJunchao Zhang           mat->num_rows = m;
1961afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1962abb89eb1SStefano Zampini           mat->num_entries = nnz;
1963afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1964afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1965aa372e3fSPaul Mullowney 
1966abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1967abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1968aa372e3fSPaul Mullowney 
1969abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1970abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1971aa372e3fSPaul Mullowney 
1972aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
197357d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1974aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1975aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1976afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1977afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1978afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1979afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
198057d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1981aa372e3fSPaul Mullowney           /* assign the pointer */
1982aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1983aa372e3fSPaul Mullowney 
1984afb2bd1cSJunchao Zhang           if (mat) {
1985afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1986afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1987afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1988afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1989087f3262SPaul Mullowney           }
1990afb2bd1cSJunchao Zhang          #endif
1991087f3262SPaul Mullowney         }
1992ca45077fSPaul Mullowney 
1993aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1994213423ffSJunchao Zhang         if (a->compressedrow.use) {
1995213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1996aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1997aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1998213423ffSJunchao Zhang           tmp = m;
1999213423ffSJunchao Zhang         } else {
2000213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
2001213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
2002213423ffSJunchao Zhang           tmp = 0;
2003213423ffSJunchao Zhang         }
2004213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
2005aa372e3fSPaul Mullowney 
2006aa372e3fSPaul Mullowney         /* assign the pointer */
2007aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
20089ae82921SPaul Mullowney       } catch(char *ex) {
20099ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
20109ae82921SPaul Mullowney       }
201105035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
201285ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
201334d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
201434d6c7a5SJose E. Roman     }
2015abb89eb1SStefano Zampini     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
20169ae82921SPaul Mullowney   }
20179ae82921SPaul Mullowney   PetscFunctionReturn(0);
20189ae82921SPaul Mullowney }
20199ae82921SPaul Mullowney 
2020c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
2021aa372e3fSPaul Mullowney {
2022aa372e3fSPaul Mullowney   template <typename Tuple>
2023aa372e3fSPaul Mullowney   __host__ __device__
2024aa372e3fSPaul Mullowney   void operator()(Tuple t)
2025aa372e3fSPaul Mullowney   {
2026aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2027aa372e3fSPaul Mullowney   }
2028aa372e3fSPaul Mullowney };
2029aa372e3fSPaul Mullowney 
20307e8381f9SStefano Zampini struct VecCUDAEquals
20317e8381f9SStefano Zampini {
20327e8381f9SStefano Zampini   template <typename Tuple>
20337e8381f9SStefano Zampini   __host__ __device__
20347e8381f9SStefano Zampini   void operator()(Tuple t)
20357e8381f9SStefano Zampini   {
20367e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
20377e8381f9SStefano Zampini   }
20387e8381f9SStefano Zampini };
20397e8381f9SStefano Zampini 
2040e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse
2041e6e9a74fSStefano Zampini {
2042e6e9a74fSStefano Zampini   template <typename Tuple>
2043e6e9a74fSStefano Zampini   __host__ __device__
2044e6e9a74fSStefano Zampini   void operator()(Tuple t)
2045e6e9a74fSStefano Zampini   {
2046e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
2047e6e9a74fSStefano Zampini   }
2048e6e9a74fSStefano Zampini };
2049e6e9a74fSStefano Zampini 
2050afb2bd1cSJunchao Zhang struct MatMatCusparse {
2051ccdfe979SStefano Zampini   PetscBool             cisdense;
2052ccdfe979SStefano Zampini   PetscScalar           *Bt;
2053ccdfe979SStefano Zampini   Mat                   X;
2054fcdce8c4SStefano Zampini   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2055fcdce8c4SStefano Zampini   PetscLogDouble        flops;
2056fcdce8c4SStefano Zampini   CsrMatrix             *Bcsr;
2057b4285af6SJunchao Zhang 
2058afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2059fcdce8c4SStefano Zampini   cusparseSpMatDescr_t  matSpBDescr;
2060afb2bd1cSJunchao Zhang   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
2061afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t  matBDescr;
2062afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t  matCDescr;
2063afb2bd1cSJunchao Zhang   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
2064b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2065b4285af6SJunchao Zhang   void                  *dBuffer4;
2066b4285af6SJunchao Zhang   void                  *dBuffer5;
2067b4285af6SJunchao Zhang  #endif
2068fcdce8c4SStefano Zampini   size_t                mmBufferSize;
2069fcdce8c4SStefano Zampini   void                  *mmBuffer;
2070fcdce8c4SStefano Zampini   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2071fcdce8c4SStefano Zampini   cusparseSpGEMMDescr_t spgemmDesc;
2072afb2bd1cSJunchao Zhang #endif
2073afb2bd1cSJunchao Zhang };
2074ccdfe979SStefano Zampini 
2075ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2076ccdfe979SStefano Zampini {
2077ccdfe979SStefano Zampini   PetscErrorCode   ierr;
2078ccdfe979SStefano Zampini   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
2079ccdfe979SStefano Zampini   cudaError_t      cerr;
2080fcdce8c4SStefano Zampini  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2081fcdce8c4SStefano Zampini   cusparseStatus_t stat;
2082fcdce8c4SStefano Zampini  #endif
2083ccdfe979SStefano Zampini 
2084ccdfe979SStefano Zampini   PetscFunctionBegin;
2085ccdfe979SStefano Zampini   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
2086fcdce8c4SStefano Zampini   delete mmdata->Bcsr;
2087afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2088fcdce8c4SStefano Zampini   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
2089afb2bd1cSJunchao Zhang   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
2090afb2bd1cSJunchao Zhang   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
2091fcdce8c4SStefano Zampini   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
2092b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2093b4285af6SJunchao Zhang   if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
2094b4285af6SJunchao Zhang   if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
2095b4285af6SJunchao Zhang  #endif
2096b4285af6SJunchao Zhang   if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
2097b4285af6SJunchao Zhang   if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
2098afb2bd1cSJunchao Zhang  #endif
2099ccdfe979SStefano Zampini   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
2100ccdfe979SStefano Zampini   ierr = PetscFree(data);CHKERRQ(ierr);
2101ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2102ccdfe979SStefano Zampini }
2103ccdfe979SStefano Zampini 
2104ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2105ccdfe979SStefano Zampini 
2106ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2107ccdfe979SStefano Zampini {
2108ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2109ccdfe979SStefano Zampini   Mat                          A,B;
2110afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
2111ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
2112ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2113ccdfe979SStefano Zampini   cusparseStatus_t             stat;
2114ccdfe979SStefano Zampini   cusparseOperation_t          opA;
2115ccdfe979SStefano Zampini   const PetscScalar            *barray;
2116ccdfe979SStefano Zampini   PetscScalar                  *carray;
2117ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2118ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2119ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2120ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2121ccdfe979SStefano Zampini 
2122ccdfe979SStefano Zampini   PetscFunctionBegin;
2123ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2124e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2125ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
2126ccdfe979SStefano Zampini   A    = product->A;
2127ccdfe979SStefano Zampini   B    = product->B;
2128ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2129e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2130ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2131ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
2132ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2133ccdfe979SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2134ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2135ccdfe979SStefano Zampini   switch (product->type) {
2136ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2137ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2138ccdfe979SStefano Zampini     mat = cusp->mat;
2139ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2140ccdfe979SStefano Zampini     m   = A->rmap->n;
2141ccdfe979SStefano Zampini     n   = B->cmap->n;
2142ccdfe979SStefano Zampini     break;
2143ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
21441a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2145e6e9a74fSStefano Zampini       mat = cusp->mat;
2146e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2147e6e9a74fSStefano Zampini     } else {
21483606e59fSJunchao Zhang       ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
2149ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2150ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2151e6e9a74fSStefano Zampini     }
2152ccdfe979SStefano Zampini     m = A->cmap->n;
2153ccdfe979SStefano Zampini     n = B->cmap->n;
2154ccdfe979SStefano Zampini     break;
2155ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2156ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2157ccdfe979SStefano Zampini     mat = cusp->mat;
2158ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2159ccdfe979SStefano Zampini     m   = A->rmap->n;
2160ccdfe979SStefano Zampini     n   = B->rmap->n;
2161ccdfe979SStefano Zampini     break;
2162ccdfe979SStefano Zampini   default:
2163e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2164ccdfe979SStefano Zampini   }
2165e8d2b73aSMark Adams   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2166ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2167ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2168ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2169afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2170ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2171afb2bd1cSJunchao Zhang 
2172ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2173c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2174c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2175c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2176c8378d12SStefano Zampini   } else {
2177c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2178c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2179c8378d12SStefano Zampini   }
2180c8378d12SStefano Zampini 
2181c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2182afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2183afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2184a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2185afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2186fcdce8c4SStefano Zampini     size_t mmBufferSize;
2187afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2188afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2189afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2190afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2191afb2bd1cSJunchao Zhang     }
2192c8378d12SStefano Zampini 
2193afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2194afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2195afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2196afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2197afb2bd1cSJunchao Zhang     }
2198afb2bd1cSJunchao Zhang 
2199afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2200afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2201afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2202afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2203afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2204afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2205afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2206afb2bd1cSJunchao Zhang     }
2207afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2208afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2209afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2210fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2211fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2212ee7b52eaSHong Zhang       cudaError_t cerr;
2213fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2214fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2215fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2216fcdce8c4SStefano Zampini     }
2217afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2218afb2bd1cSJunchao Zhang   } else {
2219afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2220afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2221afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2222afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2223afb2bd1cSJunchao Zhang   }
2224afb2bd1cSJunchao Zhang 
2225afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2226afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2227afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2228afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2229fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2230afb2bd1cSJunchao Zhang  #else
2231afb2bd1cSJunchao Zhang   PetscInt k;
2232afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2233ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2234ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2235ccdfe979SStefano Zampini     cublasStatus_t cerr;
2236ccdfe979SStefano Zampini 
2237ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2238ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2239ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2240ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2241ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2242ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2243ccdfe979SStefano Zampini     blda = B->cmap->n;
2244afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2245afb2bd1cSJunchao Zhang   } else {
2246afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2247ccdfe979SStefano Zampini   }
2248ccdfe979SStefano Zampini 
2249afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2250ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2251afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2252ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2253ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2254ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2255ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2256ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2257afb2bd1cSJunchao Zhang  #endif
2258c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2259c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2260ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2261ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2262ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2263ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2264ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2265ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2266ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2267ccdfe979SStefano Zampini   } else {
2268ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2269ccdfe979SStefano Zampini   }
2270ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2271ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2272ccdfe979SStefano Zampini   }
2273ccdfe979SStefano Zampini   if (!biscuda) {
2274ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2275ccdfe979SStefano Zampini   }
2276ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2277ccdfe979SStefano Zampini }
2278ccdfe979SStefano Zampini 
2279ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2280ccdfe979SStefano Zampini {
2281ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2282ccdfe979SStefano Zampini   Mat                A,B;
2283ccdfe979SStefano Zampini   PetscInt           m,n;
2284ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2285ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2286ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2287ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2288ccdfe979SStefano Zampini 
2289ccdfe979SStefano Zampini   PetscFunctionBegin;
2290ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2291e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2292ccdfe979SStefano Zampini   A    = product->A;
2293ccdfe979SStefano Zampini   B    = product->B;
2294ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2295e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2296ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2297e8d2b73aSMark Adams   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2298ccdfe979SStefano Zampini   switch (product->type) {
2299ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2300ccdfe979SStefano Zampini     m = A->rmap->n;
2301ccdfe979SStefano Zampini     n = B->cmap->n;
2302ccdfe979SStefano Zampini     break;
2303ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2304ccdfe979SStefano Zampini     m = A->cmap->n;
2305ccdfe979SStefano Zampini     n = B->cmap->n;
2306ccdfe979SStefano Zampini     break;
2307ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2308ccdfe979SStefano Zampini     m = A->rmap->n;
2309ccdfe979SStefano Zampini     n = B->rmap->n;
2310ccdfe979SStefano Zampini     break;
2311ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2312ccdfe979SStefano Zampini     m = B->cmap->n;
2313ccdfe979SStefano Zampini     n = B->cmap->n;
2314ccdfe979SStefano Zampini     break;
2315ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2316ccdfe979SStefano Zampini     m = B->rmap->n;
2317ccdfe979SStefano Zampini     n = B->rmap->n;
2318ccdfe979SStefano Zampini     break;
2319ccdfe979SStefano Zampini   default:
2320e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2321ccdfe979SStefano Zampini   }
2322ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2323ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2324ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2325ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2326ccdfe979SStefano Zampini 
2327ccdfe979SStefano Zampini   /* product data */
2328ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2329ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2330afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2331afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2332ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2333afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2334ccdfe979SStefano Zampini   }
2335afb2bd1cSJunchao Zhang  #endif
2336ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2337ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2338ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2339ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2340ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2341ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2342ccdfe979SStefano Zampini     } else {
2343ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2344ccdfe979SStefano Zampini     }
2345ccdfe979SStefano Zampini   }
2346ccdfe979SStefano Zampini   C->product->data    = mmdata;
2347ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2348ccdfe979SStefano Zampini 
2349ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2350ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2351ccdfe979SStefano Zampini }
2352ccdfe979SStefano Zampini 
2353fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2354ccdfe979SStefano Zampini {
2355ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2356fcdce8c4SStefano Zampini   Mat                          A,B;
2357fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2358fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2359fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2360fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2361fcdce8c4SStefano Zampini   PetscBool                    flg;
2362ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2363fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2364fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2365fcdce8c4SStefano Zampini   MatProductType               ptype;
2366fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2367fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2368fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2369fcdce8c4SStefano Zampini #endif
2370b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2371ccdfe979SStefano Zampini 
2372ccdfe979SStefano Zampini   PetscFunctionBegin;
2373ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2374e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2375fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2376e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2377fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2378fcdce8c4SStefano Zampini   A = product->A;
2379fcdce8c4SStefano Zampini   B = product->B;
2380fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2381fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2382fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2383e8d2b73aSMark Adams     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2384fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
2385e8d2b73aSMark Adams     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2386fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
2387e8d2b73aSMark Adams     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2388fcdce8c4SStefano Zampini     goto finalize;
2389fcdce8c4SStefano Zampini   }
2390fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
2391fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2392e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2393fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2394e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2395fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2396fcdce8c4SStefano Zampini   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2397fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2398fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2399fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2400e8d2b73aSMark Adams   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2401e8d2b73aSMark Adams   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2402e8d2b73aSMark Adams   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2403fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2404fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2405fcdce8c4SStefano Zampini 
2406fcdce8c4SStefano Zampini   ptype = product->type;
2407fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2408fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2409fa046f9fSJunchao Zhang     if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2410fa046f9fSJunchao Zhang   }
2411fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2412fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2413fa046f9fSJunchao Zhang     if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2414fa046f9fSJunchao Zhang   }
2415fcdce8c4SStefano Zampini   switch (ptype) {
2416fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2417fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2418fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2419fcdce8c4SStefano Zampini     break;
2420fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2421fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2422fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2423fcdce8c4SStefano Zampini     break;
2424fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2425fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2426fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2427fcdce8c4SStefano Zampini     break;
2428fcdce8c4SStefano Zampini   default:
2429e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2430fcdce8c4SStefano Zampini   }
2431fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
2432e8d2b73aSMark Adams   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2433e8d2b73aSMark Adams   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2434e8d2b73aSMark Adams   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2435fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2436fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2437fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
2438e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2439e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2440e8d2b73aSMark Adams   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2441fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2442fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2443fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2444b4285af6SJunchao Zhang   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2445b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2446b4285af6SJunchao Zhang     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2447b4285af6SJunchao Zhang                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2448b4285af6SJunchao Zhang                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2449b4285af6SJunchao Zhang                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2450b4285af6SJunchao Zhang   #else
2451b4285af6SJunchao Zhang     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2452fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2453fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2454fcdce8c4SStefano Zampini                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2455b4285af6SJunchao Zhang     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2456fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2457fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2458b4285af6SJunchao Zhang   #endif
2459fcdce8c4SStefano Zampini #else
2460b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2461fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2462fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2463fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2464fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2465fcdce8c4SStefano Zampini #endif
2466fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2467fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2468fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2469fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2470fcdce8c4SStefano Zampini finalize:
2471fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
2472fcdce8c4SStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2473fcdce8c4SStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2474fcdce8c4SStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2475fcdce8c4SStefano Zampini   c->reallocs         = 0;
2476fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2477fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2478fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2479fcdce8c4SStefano Zampini   C->num_ass++;
2480ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2481ccdfe979SStefano Zampini }
2482fcdce8c4SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse-sparse product stored in C->product,
  for A and B both of type MATSEQAIJCUSPARSE and both in MAT_CUSPARSE_CSR storage format.

  Input/Output Parameter:
. C - the product matrix; C->product provides A, B and the product type

  Supported product types are MATPRODUCT_AB, MATPRODUCT_AtB and MATPRODUCT_ABt; any other type raises PETSC_ERR_GPU.
  PETSC_ERR_GPU is also raised if C->product->data is already set, if A or B have the wrong type or storage format,
  or if the needed mult/CSR structures of A or B are missing.

  Outline of what the code below does:
  - allocates a MatMatCusparse, stores it in C->product->data and registers MatDestroy_MatMatCusparse to free it
  - copies A and B to the GPU; since opA/opB are always CUSPARSE_OPERATION_NON_TRANSPOSE, AtB and ABt are handled by
    forming the explicit transpose of A (resp. B) and multiplying with it
  - sizes C as m x n, sets its type to MATSEQAIJCUSPARSE, and creates its Mat_SeqAIJCUSPARSEMultStruct and CsrMatrix
  - obtains the nonzero pattern of C from cuSPARSE (three code paths selected by the CUDA version at compile time);
    the cuSPARSE calls used here are handed the value arrays as well, see the reusesym comment near the end
  - copies row offsets and column indices back to the host to fill the Mat_SeqAIJ fields of C (i, j, ilen, imax, ...)
  - leaves C unassembled and sets C->ops->productnumeric to MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE
*/
2483fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2484fcdce8c4SStefano Zampini {
2485fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2486fcdce8c4SStefano Zampini   Mat                          A,B;
2487fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2488fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2489fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2490fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2491fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2492fcdce8c4SStefano Zampini   PetscBool                    flg;
2493fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2494fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2495fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2496fcdce8c4SStefano Zampini   MatProductType               ptype;
2497fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2498fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2499fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2500fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2501fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2502fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2503fcdce8c4SStefano Zampini #else
2504fcdce8c4SStefano Zampini   int                          cnz;
2505fcdce8c4SStefano Zampini #endif
2506b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2507fcdce8c4SStefano Zampini 
2508fcdce8c4SStefano Zampini   PetscFunctionBegin;
2509fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2510e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2511fcdce8c4SStefano Zampini   A    = product->A;
2512fcdce8c4SStefano Zampini   B    = product->B;
2513fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2514e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2515fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2516e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2517fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2518fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2519fcdce8c4SStefano Zampini   /* product data */
2520fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2521fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2522fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2523fcdce8c4SStefano Zampini 
2524fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2525fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2526d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2527d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2528d60bce21SJunchao Zhang   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2529d60bce21SJunchao Zhang   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2530d60bce21SJunchao Zhang 
  /* when A (resp. B) is flagged symmetric, AtB (resp. ABt) is treated as a plain AB product;
     the product flags record that the symbolic phase relied on this */
2531fcdce8c4SStefano Zampini   ptype = product->type;
2532fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2533fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2534fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2535fa046f9fSJunchao Zhang   }
2536fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2537fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2538fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2539fa046f9fSJunchao Zhang   }
  /* select the sizes (C is m x n, k is the inner dimension) and the mult structs actually multiplied;
     C inherits compressed-row storage from A only when A itself (not its transpose) is the left factor,
     and B's compressed-row storage only matters when B itself (not its transpose) is the right factor */
2540fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2541fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2542fcdce8c4SStefano Zampini   switch (ptype) {
2543fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2544fcdce8c4SStefano Zampini     m = A->rmap->n;
2545fcdce8c4SStefano Zampini     n = B->cmap->n;
2546fcdce8c4SStefano Zampini     k = A->cmap->n;
2547fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2548fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2549fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2550fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2551fcdce8c4SStefano Zampini     break;
2552fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2553fcdce8c4SStefano Zampini     m = A->cmap->n;
2554fcdce8c4SStefano Zampini     n = B->cmap->n;
2555fcdce8c4SStefano Zampini     k = A->rmap->n;
25563606e59fSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
2557fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2558fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2559fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2560fcdce8c4SStefano Zampini     break;
2561fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2562fcdce8c4SStefano Zampini     m = A->rmap->n;
2563fcdce8c4SStefano Zampini     n = B->rmap->n;
2564fcdce8c4SStefano Zampini     k = A->cmap->n;
25653606e59fSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
2566fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2567fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2568fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2569fcdce8c4SStefano Zampini     break;
2570fcdce8c4SStefano Zampini   default:
2571e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2572fcdce8c4SStefano Zampini   }
2573fcdce8c4SStefano Zampini 
2574fcdce8c4SStefano Zampini   /* create cusparse matrix */
2575fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2576fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2577fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2578fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2579fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2580fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2581fcdce8c4SStefano Zampini 
2582fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2583fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
2584fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2585fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2586fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2587fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2588fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2589fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2590fcdce8c4SStefano Zampini   } else {
2591fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2592fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2593fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2594fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2595fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2596fcdce8c4SStefano Zampini   }
2597fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2598fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2599fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2600fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2601fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2602fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2603fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2604fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2605fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants of C's mult struct (alpha_one and beta_zero are passed to the SpGEMM calls below) */
2606fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2607fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2608fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2609fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2610fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2611fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2612fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2613fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2614fcdce8c4SStefano Zampini     c->nz = 0;
2615fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2616fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2617fcdce8c4SStefano Zampini     goto finalizesym;
2618fcdce8c4SStefano Zampini   }
2619fcdce8c4SStefano Zampini 
2620e8d2b73aSMark Adams   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2621e8d2b73aSMark Adams   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2622fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2623fcdce8c4SStefano Zampini   if (!biscompressed) {
2624fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2625fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2626fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2627fcdce8c4SStefano Zampini #endif
2628fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
    /* the new Bcsr shares column indices and values with B's compressed CSR (pointers are copied, not the data);
       only the row offsets differ: they come from the host b->i and are cached in Bcusp->rowoffsets_gpu.
       The wrapper is kept in mmdata->Bcsr */
2629fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2630fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2631fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2632fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2633fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2634fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2635fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2636fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2637fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2638fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2639fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2640fcdce8c4SStefano Zampini     }
2641fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2642fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2643fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2644fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2645fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2646fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2647fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2648fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2649fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2650fcdce8c4SStefano Zampini     }
2651fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2652fcdce8c4SStefano Zampini #endif
2653fcdce8c4SStefano Zampini   }
2654e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2655e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2656fcdce8c4SStefano Zampini   /* precompute flops count */
  /* computed on the host from the row pointers/column indices of a and b; stored in mmdata->flops below */
2657fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2658fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2659fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2660fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2661fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2662fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2663fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2664fcdce8c4SStefano Zampini       }
2665fcdce8c4SStefano Zampini     }
2666fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2667fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2668fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2669fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2670fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2671fcdce8c4SStefano Zampini     }
2672fcdce8c4SStefano Zampini   } else { /* TODO */
2673fcdce8c4SStefano Zampini     flops = 0.;
2674fcdce8c4SStefano Zampini   }
2675fcdce8c4SStefano Zampini 
2676fcdce8c4SStefano Zampini   mmdata->flops = flops;
2677fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2678b4285af6SJunchao Zhang 
  /* Three compile-time code paths follow:
       CUDA >= 11.4          : cusparseSpGEMMreuse_* API
       11.0 <= CUDA < 11.4   : cusparseSpGEMM_* API
       CUDA < 11.0           : cusparseXcsrgemmNnz + cusparse_csr_spgemm */
2679fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2680fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2681fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2682fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
2683fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2684fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2685fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2686b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2687b4285af6SJunchao Zhang  {
2688b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2689b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2690b4285af6SJunchao Zhang   */
  /* each stage below is called twice: first with NULL buffers to query the required size(s), then with the allocated buffers */
2691b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
2692b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
2693b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
2694b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2695b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
2696b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
2697b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
2698b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
2699b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
2700b4285af6SJunchao Zhang 
2701b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2702b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
2703b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2704b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2705b4285af6SJunchao Zhang                                             &bufferSize1, NULL);CHKERRCUSPARSE(stat);
2706b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
2707b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
2708b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2709b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2710b4285af6SJunchao Zhang                                             &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);
2711b4285af6SJunchao Zhang 
2712b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2713b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2714b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2715b4285af6SJunchao Zhang                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
2716b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
2717b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
2718b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
2719b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2720b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2721b4285af6SJunchao Zhang                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
2722b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
2723b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);
2724b4285af6SJunchao Zhang 
2725b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2726b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
2727b4285af6SJunchao Zhang   stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2728b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
2729b4285af6SJunchao Zhang   /* allocate matrix C */
2730b4285af6SJunchao Zhang   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2731b4285af6SJunchao Zhang   Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2732b4285af6SJunchao Zhang   /* update matC with the new pointers */
2733b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2734b4285af6SJunchao Zhang                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2735b4285af6SJunchao Zhang 
2736b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2737b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2738b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2739b4285af6SJunchao Zhang                                   &bufferSize5, NULL);CHKERRCUSPARSE(stat);
2740b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
2741b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2742b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2743b4285af6SJunchao Zhang                                   &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
2744b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
2745b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2746b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2747b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2748b4285af6SJunchao Zhang                                      mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2749b4285af6SJunchao Zhang   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
2750b4285af6SJunchao Zhang  }
2751b4285af6SJunchao Zhang  #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2752b4285af6SJunchao Zhang   size_t bufSize2;
2753fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2754b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2755fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2756fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2757fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2758bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2759fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2760b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2761fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2762fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2763fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2764fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2765b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2766fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2767fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2768fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2769fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2770fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2771fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2772fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2773fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2774bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2775fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2776b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2777fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2778fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2779fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2780fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2781fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2782fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
278300702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2784fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2785fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2786fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2787fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2788fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2789fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2790b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2791fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2792fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2793b4285af6SJunchao Zhang  #endif
2794fcdce8c4SStefano Zampini #else
  /* cnz is a host variable, hence the host pointer mode for the nnz query; device pointer mode is restored before the product */
2795fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2796b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2797fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2798fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2799fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2800fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2801fcdce8c4SStefano Zampini   c->nz = cnz;
2802fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2803fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2804fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2805fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2806fcdce8c4SStefano Zampini 
2807fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2808fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2809fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2810fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2811b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2812fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2813fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2814fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2815fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2816fcdce8c4SStefano Zampini #endif
2817fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2818fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* common tail, also reached directly for empty products: build the host-side Mat_SeqAIJ structure of C from the device CSR */
2819fcdce8c4SStefano Zampini finalizesym:
2820fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2821fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2822fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2823fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2824fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  /* when C is in compressed-row format the device row offsets only cover the nonempty rows, so they are copied
     into c->compressedrow.i and expanded into c->i further below */
2825fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2826fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2827fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2828fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2829fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2830fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2831fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2832fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2833fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2834fcdce8c4SStefano Zampini   } else {
    /* NOTE(review): the copies below read size()*sizeof(PetscInt) bytes straight out of THRUSTINTARRAY32 storage;
       this presumably relies on PetscInt being 32 bits wide whenever this branch is taken - confirm */
2835fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2836fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2837fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2838fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2839fcdce8c4SStefano Zampini   }
2840fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
    /* rows not listed in compressedrow.rindex are empty: they repeat the offset of the next listed row,
       and rows after the last listed one get the final offset */
2841fcdce8c4SStefano Zampini     PetscInt r = 0;
2842fcdce8c4SStefano Zampini     c->i[0] = 0;
2843fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2844fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2845fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2846fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2847fcdce8c4SStefano Zampini     }
2848fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2849fcdce8c4SStefano Zampini   }
2850fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2851fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2852fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2853fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2854fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2855fcdce8c4SStefano Zampini   c->rmax = 0;
  /* per-row lengths, number of nonempty rows and maximum row length, all derived from c->i */
2856fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2857fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2858fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2859fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2860fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2861fcdce8c4SStefano Zampini   }
2862fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2863fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2864fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2865fcdce8c4SStefano Zampini 
2866fcdce8c4SStefano Zampini   C->nonzerostate++;
2867fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2868fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2869fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2870fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2871fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2872fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2873fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2874abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2875fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2876fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2877fcdce8c4SStefano Zampini   }
2878fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2879fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2880fcdce8c4SStefano Zampini }
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/*
   MatProductSetFromOptions_SeqAIJCUSPARSE - Picks the symbolic product routine for a product whose
   B operand is either sequential dense or sparse.

   The CUSPARSE sparse-sparse path is taken only when neither A nor B is bound to the CPU, B is of
   type MATSEQAIJCUSPARSE and, for MATPRODUCT_ABC, C is an unbound MATSEQAIJCUSPARSE as well. In that
   situation a "*_backend_cpu" option can still force the CPU code. Anything else that is not dense
   is handed to MatProductSetFromOptions_SeqAIJ().
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  if (product->type == MATPRODUCT_ABC) {
    /* the third operand must live on the GPU too */
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    const PetscBool userapi  = product->api_user;
    PetscBool       usecpu   = PETSC_FALSE;
    const char      *title   = NULL; /* heading of the options block */
    const char      *optname = NULL; /* option name; stays NULL when the product type has no option */
    const char      *manpage = NULL; /* manual page associated with the option */

    switch (product->type) {
    case MATPRODUCT_AB:
      manpage = "MatMatMult";
      title   = userapi ? "MatMatMult" : "MatProduct_AB";
      optname = userapi ? "-matmatmult_backend_cpu" : "-matproduct_ab_backend_cpu";
      break;
    case MATPRODUCT_AtB:
      manpage = "MatTransposeMatMult";
      title   = userapi ? "MatTransposeMatMult" : "MatProduct_AtB";
      optname = userapi ? "-mattransposematmult_backend_cpu" : "-matproduct_atb_backend_cpu";
      break;
    case MATPRODUCT_PtAP:
      manpage = "MatPtAP";
      title   = userapi ? "MatPtAP" : "MatProduct_PtAP";
      optname = userapi ? "-matptap_backend_cpu" : "-matproduct_ptap_backend_cpu";
      break;
    case MATPRODUCT_RARt:
      manpage = "MatRARt";
      title   = userapi ? "MatRARt" : "MatProduct_RARt";
      optname = userapi ? "-matrart_backend_cpu" : "-matproduct_rart_backend_cpu";
      break;
    case MATPRODUCT_ABC:
      manpage = "MatMatMatMult";
      title   = userapi ? "MatMatMatMult" : "MatProduct_ABC";
      optname = userapi ? "-matmatmatmult_backend_cpu" : "-matproduct_abc_backend_cpu";
      break;
    default:
      break;
    }
    if (optname) {
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");CHKERRQ(ierr);
      ierr = PetscOptionsBool(optname,"Use CPU code",manpage,usecpu,&usecpu,NULL);CHKERRQ(ierr);
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!product->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are delegated to the basic ABC symbolic routine */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3006ccdfe979SStefano Zampini 
/* yy = A xx: forwards to the shared multiply-add kernel with no vector to add and no transposition */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3015e6e9a74fSStefano Zampini 
/* zz = A xx + yy: forwards to the shared multiply-add kernel without transposition */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3024e6e9a74fSStefano Zampini 
/* yy = A^H xx: forwards to the shared multiply-add kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3033e6e9a74fSStefano Zampini 
/* zz = A^H xx + yy: forwards to the shared multiply-add kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30429ae82921SPaul Mullowney 
/* yy = A^T xx: forwards to the shared multiply-add kernel with only the transpose flag set */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3051ca45077fSPaul Mullowney 
/*
   ScatterAdd - CUDA kernel performing y[idx[i]] += x[i] for every 0 <= i < n.

   Each thread handles at most one entry, selected by its global index in a 1D launch; threads whose
   index is >= n do nothing, so the grid may be rounded up to a whole number of blocks.
   The update is a plain (non-atomic) read-modify-write.
   NOTE(review): this presumably relies on the entries of idx being distinct - confirm against callers.

   The global index is formed in 64-bit arithmetic: blockIdx.x*blockDim.x is otherwise evaluated as a
   32-bit unsigned product and stored in an int, which can wrap or become negative while n (a PetscInt,
   possibly 64-bit) is still larger.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const long long i = (long long)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < (long long)n) y[idx[i]] += x[i];
}
3057a0e72f99SJunchao Zhang 
/*
   MatMultAddKernel_SeqAIJCUSPARSE - z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Input Parameters:
+  A     - the matrix
.  xx    - the vector multiplied by op(A)
.  yy    - the vector added to the product, or NULL for a plain product; may be the same vector as zz
.  trans - PETSC_TRUE to apply the transpose of A
-  herm  - PETSC_TRUE to apply the conjugate transpose; an error is raised if set without trans

   Output Parameter:
.  zz - the result vector

   Notes:
   When A has no nonzero rows the product is skipped and zz is set to yy (or to zero if yy is NULL).
   When the multiplication structure uses compressed rows, a work vector holds the short side of the
   product and is scattered into / gathered from the full-length vector.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx = 0,ny = 0; /* only meaningful (and only used) for the CSR format */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* no nonzero rows: the product contributes nothing, so z is just y (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* multiply with the explicitly stored transpose; opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
    /* matstruct is dereferenced right below, so fail cleanly instead of crashing if it is missing */
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have the multiplication structure needed for the transpose product");
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscInt n = matstruct->cprowIndices->size();

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* A hand-written ScatterAdd kernel is used instead of thrust::async::for_each(): the latter returns an event
           (internally registered) whose destructor calls cudaStreamSynchronize() on the stream at the end of the scope,
           and one would have to store all events to prevent that.
         */
        if (n > 0) { /* a launch with zero blocks is an invalid configuration */
          ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
          /* kernel launches return nothing; pick up configuration/launch failures here */
          cerr = cudaGetLastError();CHKERRCUDA(cerr);
        }
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
32499ae82921SPaul Mullowney 
/* zz = A^T xx + yy: forwards to the shared multiply-add kernel with only the transpose flag set */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3258ca45077fSPaul Mullowney 
/*
   Finishes assembly through MatAssemblyEnd_SeqAIJ() and, if that changed the nonzero state of A,
   frees the device matrix held in the CUSPARSE structure (when one exists) and clears its pointer.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE     *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscObjectState stateBefore = A->nonzerostate; /* captured before the host assembly runs */
  PetscErrorCode         ierr;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* nothing to release when the nonzero state is unchanged or no device matrix was created */
  if (stateBefore == A->nonzerostate || !cusp->deviceMat) PetscFunctionReturn(0);

  ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
  cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
  cusp->deviceMat = NULL;
  PetscFunctionReturn(0);
}
32769ae82921SPaul Mullowney 
32779ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3278e057df02SPaul Mullowney /*@
32799ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3280e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately be pushed down
3281e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3282e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3283e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3284e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
32859ae82921SPaul Mullowney 
3286d083f849SBarry Smith    Collective
32879ae82921SPaul Mullowney 
32889ae82921SPaul Mullowney    Input Parameters:
32899ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
32909ae82921SPaul Mullowney .  m - number of rows
32919ae82921SPaul Mullowney .  n - number of columns
32929ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
32939ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
32940298fd71SBarry Smith          (possibly different for each row) or NULL
32959ae82921SPaul Mullowney 
32969ae82921SPaul Mullowney    Output Parameter:
32979ae82921SPaul Mullowney .  A - the matrix
32989ae82921SPaul Mullowney 
32999ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
33009ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
33019ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
33029ae82921SPaul Mullowney 
33039ae82921SPaul Mullowney    Notes:
33049ae82921SPaul Mullowney    If nnz is given then nz is ignored
33059ae82921SPaul Mullowney 
33069ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
33079ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
33089ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
33099ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
33109ae82921SPaul Mullowney 
33119ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
33120298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
33139ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
33149ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
33159ae82921SPaul Mullowney 
33169ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
33179ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
33189ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
33199ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
33209ae82921SPaul Mullowney 
33219ae82921SPaul Mullowney    Level: intermediate
33229ae82921SPaul Mullowney 
3323e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
33249ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscInt       *rownnz = (PetscInt*)nnz; /* the preallocation routine is handed a non-const pointer */

  PetscFunctionBegin;
  /* create the object; local and global sizes are both set to m x n */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  /* select the CUSPARSE implementation, then preallocate the host AIJ storage */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,rownnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33369ae82921SPaul Mullowney 
/*
  MatDestroy_SeqAIJCUSPARSE - Destroys the GPU-side data attached to A->spptr, removes the
  functions composed on the object under the names listed below, and finally calls
  MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed-function names cleared on destruction, in the order they are removed */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };
  const PetscInt ncomposed = (PetscInt)(sizeof(composed)/sizeof(composed[0]));
  PetscInt       k;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* spptr holds a different struct depending on whether A is a factored matrix */
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  for (k = 0; k < ncomposed; k++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[k],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33609ae82921SPaul Mullowney 
3361ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
336295639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicates A with MatDuplicate_SeqAIJ() and then converts
  the duplicate in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side duplicate first ... */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* ... then the in-place conversion installs the CUSPARSE operations on *B */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33729ff858a8SKarl Rupp 
/*
  MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y.

  Input Parameters:
+ Y   - the matrix that is updated
. a   - the scalar multiplying X
. X   - the matrix that is added
- str - the relation between the nonzero patterns of X and Y

  Notes:
  If X and Y do not share the same axpy implementation, the operation is delegated to
  MatAXPY_SeqAIJ(). Otherwise both matrices are copied to the GPU and must use
  MAT_CUSPARSE_CSR. SUBSET_NONZERO_PATTERN is handled with the cuSPARSE csr spgeam wrapper,
  SAME_NONZERO_PATTERN with a cuBLAS axpy over the value arrays, and everything else
  falls back to MatAXPY_SeqAIJ().
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *xaij  = (Mat_SeqAIJ*)X->data;
  Mat_SeqAIJ         *yaij  = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *ycusp = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  Mat_SeqAIJCUSPARSE *xcusp = (Mat_SeqAIJCUSPARSE*)X->spptr;
  PetscScalar        *yvals;
  const PetscScalar  *xvals;
  CsrMatrix          *ycsr,*xcsr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices do not share this implementation: use the host code */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (ycusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (xcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix*)ycusp->mat->mat;
  xcsr = (CsrMatrix*)xcusp->mat->mat;

  /* see if we can turn this into a cublas axpy: compare row offsets first, column indices only if those match */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    const bool samerows = thrust::equal(thrust::device,ycsr->row_offsets->begin(),ycsr->row_offsets->end(),xcsr->row_offsets->begin());

    if (samerows && thrust::equal(thrust::device,ycsr->column_indices->begin(),ycsr->column_indices->end(),xcsr->column_indices->begin())) {
      str = SAME_NONZERO_PATTERN;
    }
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  switch (str) {
  case SUBSET_NONZERO_PATTERN:
  {
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
    /* the scalars a and b are passed by host address */
    stat = cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* this API needs a caller-provided work buffer; the result is written into Y's own arrays */
    stat = cusparse_csr_spgeam_bufferSize(ycusp->handle,Y->rmap->n,Y->cmap->n,
                                          &a,xcusp->mat->descr,xaij->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                                          &b,ycusp->mat->descr,yaij->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                             ycusp->mat->descr,         yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(ycusp->handle,Y->rmap->n,Y->cmap->n,
                               &a,xcusp->mat->descr,xaij->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                               &b,ycusp->mat->descr,yaij->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                  ycusp->mat->descr,         yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(xaij->nz + yaij->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(ycusp->handle,Y->rmap->n,Y->cmap->n,
                               &a,xcusp->mat->descr,xaij->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                               &b,ycusp->mat->descr,yaij->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                  ycusp->mat->descr,         yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get());CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(xaij->nz + yaij->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#endif
    stat = cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
    break;
  }
  case SAME_NONZERO_PATTERN:
  {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    /* identical patterns: a plain axpy over the two value arrays */
    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(xaij->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,xvals,one,yvals,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
    break;
  }
  default:
    /* any other pattern relation: use the host implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}
347095639643SRichard Tran Mills 
/*
  MatScale_SeqAIJCUSPARSE - Scales the stored values of Y by a, using a cuBLAS scal over the
  array obtained from MatSeqAIJCUSPARSEGetArray().

  Input Parameters:
+ Y - the matrix to scale
- a - the scaling factor
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *yaij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   inc = 1, len = 1;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  /* the number of stored entries must fit in a BLAS integer */
  ierr = PetscBLASIntCast(yaij->nz,&len);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,len,&a,vals,inc);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(len);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
349233c9ba73SStefano Zampini 
/*
  MatZeroEntries_SeqAIJCUSPARSE - Zeroes the stored values of A on the host and, when a CSR
  device copy exists, on the device as well.

  Input Parameter:
. A - the matrix

  Notes:
  The device structures are only reinterpreted as CsrMatrix when the storage format is
  MAT_CUSPARSE_CSR. For any other format the device values are left untouched and the
  offload mask is set to PETSC_OFFLOAD_CPU, marking the (zeroed) host values as the valid
  copy. NOTE(review): this relies on the next host-to-device copy refreshing the device
  matrix and its transpose for non-CSR formats - confirm against MatSeqAIJCUSPARSECopyToGPU().
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE; /* set once the device values have been zeroed too */
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* mat->mat is only known to be a CsrMatrix for the CSR storage format */
    if (spptr && spptr->format == MAT_CUSPARSE_CSR) {
      if (spptr->mat) {
        CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
        if (matrix->values) {
          both = PETSC_TRUE;
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
      if (spptr->matTranspose) {
        CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
        if (matrix->values) {
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
    }
  }
  /* zero the host values: a->i[number of rows] is the number of stored entries */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
35233fa6b06aSMark Adams 
/*
  MatBindToCPU_SeqAIJCUSPARSE - Switches the operation table of A between the host (SeqAIJ)
  and the GPU (SeqAIJCUSPARSE) implementations.

  Input Parameters:
+ A   - the matrix
- flg - PETSC_TRUE to install the SeqAIJ operations, PETSC_FALSE to install the CUSPARSE ones

  Notes:
  For factored matrices only A->boundtocpu is recorded. When binding to the CPU the values
  are first copied back from the GPU. Inodes are enabled only when bound to the CPU and
  inode information is present.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices keep their operation table; just remember the flag */
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }

  if (!flg) {
    /* GPU operation table */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* array accessors of the SeqAIJ layer */
    aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* bring the values back to the host before the host routines take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    /* host operation table */
    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear every entry of the SeqAIJ-level operation table */
    ierr = PetscMemzero(aij->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }

  A->boundtocpu = flg;
  /* inodes are used only when bound to the CPU and inode sizes are available */
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}
3587a587d139SMark 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE matrix.

  Input Parameters:
+ A     - the source matrix
. mtype - the requested type (not read by this routine)
- reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the
          existing *newmat, any other value works directly on *newmat

  Output Parameter:
. newmat - the converted matrix

  Notes:
  Unless reuse is MAT_REUSE_MATRIX, a missing spptr is allocated here: a Mat_SeqAIJCUSPARSE
  for unfactored matrices, a Mat_SeqAIJCUSPARSETriFactors otherwise, each with its own
  cuSPARSE handle attached to PetscDefaultCudaStream.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              C;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default:
    break;
  }
  C = *newmat;

  /* vectors created from this matrix default to the CUDA type */
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !C->spptr) {
    if (C->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      ierr = PetscNew(&factors);CHKERRQ(ierr);
      stat = cusparseCreate(&factors->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(factors->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      C->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      cusp->format     = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      cusp->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
#else
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
#endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      C->spptr = cusp;
    }
    C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that do not depend on the CPU/GPU binding */
  C->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  C->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operation table, then publish the new type name and composed functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(C,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)C,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)C,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  ierr = PetscObjectComposeFunction((PetscObject)C,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
36499ae82921SPaul Mullowney 
/*
  MatCreate_SeqAIJCUSPARSE - Type constructor: builds B as a SeqAIJ matrix and converts it
  in place to MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* set up the host AIJ data structures first */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* then install the CUSPARSE operations on the same object */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
365902fe1965SBarry Smith 
36603ca39a21SBarry Smith /*MC
3661e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3662e057df02SPaul Mullowney 
3663e057df02SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
36642692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
36652692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3666e057df02SPaul Mullowney 
3667e057df02SPaul Mullowney    Options Database Keys:
3668e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3669aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3670a2b725a8SWilliam Gropp .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3671365b711fSMark Adams -  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3672e057df02SPaul Mullowney 
3673e057df02SPaul Mullowney   Level: beginner
3674e057df02SPaul Mullowney 
36758468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3676e057df02SPaul Mullowney M*/
36777f756511SDominic Meiser 
3678bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
36790f39cd5aSBarry Smith 
/*
  MatSolverTypeRegister_CUSPARSE - Registers the banded LU solver for MATSEQAIJ and the
  CUSPARSE solver for MATSEQAIJCUSPARSE (LU, Cholesky, ILU and ICC factorizations).
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* factorizations served by MatGetFactor_seqaijcusparse_cusparse, in registration order */
  static const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const PetscInt             nftypes  = (PetscInt)(sizeof(ftypes)/sizeof(ftypes[0]));
  PetscInt                   k;
  PetscErrorCode             ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (k = 0; k < nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
369329b38603SBarry Smith 
/*
  MatSeqAIJCUSPARSE_Destroy - Releases everything owned by a Mat_SeqAIJCUSPARSE and frees the
  structure itself.

  Input Parameter:
. cusparsestruct - address of the pointer to the structure; nothing is done if it is NULL

  Notes:
  Besides the multiply structures, the work arrays and the cuSPARSE handle, this also frees
  the cached device matrix (deviceMat) with cudaFree(), the same call
  MatAssemblyEnd_SeqAIJCUSPARSE() uses to drop it; otherwise that allocation would outlive
  the matrix.
  NOTE(review): assumes no other code path frees deviceMat without resetting it to NULL -
  confirm against the routine that allocates it.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      cerr;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->deviceMat) {
      /* release the cached device matrix so it does not leak with the host structure */
      cerr = cudaFree((*cusparsestruct)->deviceMat);CHKERRCUDA(cerr);
      (*cusparsestruct)->deviceMat = NULL;
    }
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
37137f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - deletes a CsrMatrix together with its values, column_indices and row_offsets arrays.

  Input/Output Parameter:
. mat - address of the CsrMatrix pointer; reset to NULL on return, untouched if already NULL
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
37267f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - (triangular factor overload) releases one Mat_SeqAIJCUSPARSETriFactorStruct.

  Input/Output Parameter:
. trifactor - address of the struct pointer; nothing is done when it points to NULL

  Destroys the cuSPARSE matrix descriptor and solve analysis info, the CSR storage, the device solve
  buffer, the host buffer AA_h (released with cudaFreeHost()) and, for CUDA >= 11, the csr2csc buffer.
  The struct itself is released with PetscFree().
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
37467f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - (mult struct overload) releases one Mat_SeqAIJCUSPARSEMultStruct.

  Input Parameters:
+ matstruct - address of the struct pointer; reset to NULL on return, nothing is done when it points to NULL
- format    - storage format used to interpret matstruct->mat (hybrid/ELL handle or CsrMatrix)

  Notes:
  For MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB the stored matrix is a cusparseHybMat_t; with CUDA >= 11
  those formats raise PETSC_ERR_SUP. For every other format the stored matrix is treated as a CsrMatrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* propagate failures like the triangular-factor overload does instead of dropping the return code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* only the SpMV slots flagged as initialized own a buffer and dense vector descriptors */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
37907f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases the data held by a triangular-factors container without
  freeing the container itself.

  Input Parameter:
. trifactors - address of the container pointer; nothing is done when it points to NULL

  Notes:
  The four triangular factor structs, the permutation index arrays, the work vector and the band
  arrays are released and their pointers cleared, so the routine may be called repeatedly (it is also
  called by MatSeqAIJCUSPARSETriFactors_Destroy()). The cuSPARSE handle is not touched here.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* clear the band pointers after freeing them: a later Reset/Destroy must not cudaFree() them again */
    if ((*trifactors)->a_band_d) {
      cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);
      (*trifactors)->a_band_d = NULL;
    }
    if ((*trifactors)->i_band_d) {
      cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);
      (*trifactors)->i_band_d = NULL;
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3813ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets a triangular-factors container, destroys its cuSPARSE
  handle (if any) and releases the container with PetscFree().

  Input/Output Parameter:
. trifactors - address of the container pointer; nothing is done when it points to NULL
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  if ((*trifactors)->handle) {
    stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
38307e8381f9SStefano Zampini 
/* Strict ordering of (i,j) pairs: by first component, ties broken by the second component */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    if (i1 != i2) return i1 < i2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
38417e8381f9SStefano Zampini 
/* Equality of (i,j) pairs: true only when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return thrust::get<0>(t1) == thrust::get<0>(t2) && thrust::get<1>(t1) == thrust::get<1>(t2);
  }
};
38517e8381f9SStefano Zampini 
/* Change indicator: 1 when the two indices differ, 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
38607e8381f9SStefano Zampini 
/* Logical OR of two flags (despite the name, not an arithmetic sum): 1 if either argument is nonzero, else 0 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
38697e8381f9SStefano Zampini 
38707e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the device CSR values of A from an array of COO values.

  Input Parameters:
+ A     - the matrix; its spptr must hold a Mat_SeqAIJCUSPARSE with a device matrix
. v     - the COO values, in host or device memory (tested with isCudaMem()), or NULL
- imode - ADD_VALUES adds to the stored values; any other mode overwrites them

  Notes:
  When no COO permutation is stored (cusp->cooPerm is NULL) the matrix is simply assembled.
  With v == NULL the stored values are zeroed for INSERT_VALUES and left untouched otherwise.
  When cusp->cooPerm_a exists the COO pattern had repeated (i,j) entries, which are summed with
  thrust::reduce_by_key() before being stored or added.
  On return the matrix is marked assembled and its up-to-date copy is on the GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (v) {
    const PetscInt                        n = cusp->cooPerm->size();
    THRUSTARRAY                           cooPerm_v; /* device staging copy of v[], filled only when v[] is host memory; released at scope exit */
    thrust::device_ptr<const PetscScalar> d_v;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      cooPerm_v.assign(v,v+n);
      d_v  = cooPerm_v.data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
      if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up */
        THRUSTARRAY cooPerm_w(matrix->values->size()); /* one slot per unique nonzero; released at scope exit */
        auto        vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
          cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
          cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
        */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(cooPerm_w.begin(),cooPerm_w.end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      } else {
        /* all nonzeros in d_v[] are unique entries */
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
      }
    } else {
      if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
        auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      } else {
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  } else if (imode == INSERT_VALUES) {
    /* no values given: inserting means zeroing, adding means nothing to do */
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
  }
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
39527e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached device transpose of A as out of date.

  Input Parameters:
+ A       - a MATSEQAIJCUSPARSE matrix
- destroy - when true, the transpose struct and the csr2csc_i array are also freed

  Nothing is done when A has no Mat_SeqAIJCUSPARSE attached.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (spptr) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose,spptr->format);CHKERRQ(ierr);
      delete spptr->csr2csc_i;
      spptr->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3969a49f1ed0SStefano Zampini 
39707e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR nonzero pattern of A from n COO (row,column) pairs.

  Input Parameters:
+ A     - the matrix; its spptr must hold a Mat_SeqAIJCUSPARSE
. n     - number of COO entries
. coo_i - row indices of the entries (copied to the device with assign(); NOTE(review): presumably host memory - confirm)
- coo_j - column indices of the entries (same remark)

  Notes:
  The pairs are sorted on the device by (row,column); cusp->cooPerm records the sorting permutation.
  If repeated pairs exist, cusp->cooPerm_a maps every sorted entry to the index of its unique nonzero,
  otherwise cusp->cooPerm_a is NULL. The host CSR arrays of A are rebuilt from the unique pairs, the
  values are zeroed and the structure is copied to the GPU. The matrix is left unassembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* the permutation arrays can only be reused when the number of entries is unchanged */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the unique, sorted pairs */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count the nonempty rows */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4088ed502f03SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers (may be NULL if not needed)
-   j - the CSR column indices (may be NULL if not needed)

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      When compressed is false and the matrix uses compressed row storage, the full row pointers are
      copied from the host to the device the first time they are requested and then cached

      Nothing is done when both i and j are NULL

      May trigger a host-device copy of the matrix

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* return early only when nothing is requested; a caller may ask for just one of the two arrays */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* build the device copy of the host row pointers once and keep it */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
41375f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
.   compressed - PETSC_TRUE or PETSC_FALSE, the value passed to MatSeqAIJCUSPARSEGetIJ(); not used by this routine
.   i - the CSR row pointers (may be NULL)
-   j - the CSR column indices (may be NULL)

    Level: developer

    Notes:
      The pointers referenced by i and j are set to NULL on return

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* invalidate whichever of the caller's pointers were handed in */
  if (j) {
    *j = NULL;
  }
  if (i) {
    *i = NULL;
  }
  PetscFunctionReturn(0);
}
41645f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host.
   Fails with PETSC_ERR_SUP for the ELL and HYB storage formats, and with PETSC_ERR_COR
   if the device matrix structure or its values array is missing.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* touch A->spptr only after A has been validated, so that an invalid A raises a PETSc error */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out its values */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR container itself as well as its values array */
  if (!csr || !csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4200ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - address of the pointer to the device data; the pointer is set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: the matrix is left untouched, only the caller's handle is cleared */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4225ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host.
   The matrix is flagged as valid on the GPU (PETSC_OFFLOAD_GPU) and its transpose is invalidated.
   Fails with PETSC_ERR_SUP for the ELL and HYB storage formats, and with PETSC_ERR_COR
   if the device matrix structure or its values array is missing.

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* touch A->spptr only after A has been validated, so that an invalid A raises a PETSc error */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* the caller may read the values, so the device copy must be current */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR container itself as well as its values array */
  if (!csr || !csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may modify the values on the device: mark the GPU copy as the valid one and invalidate the transpose */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - address of the pointer to the device data; the pointer is set to NULL on return

   Level: developer

   Notes: increases the object state of A

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* bump the object state of A now that write access is being given back */
  err = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(err);
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4290039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU.
   The transpose of the matrix is invalidated.
   Fails with PETSC_ERR_SUP for the ELL and HYB storage formats, and with PETSC_ERR_COR
   if the device matrix structure or its values array is missing.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* touch A->spptr only after A has been validated, so that an invalid A raises a PETSc error */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: no copy to the GPU is performed, the device structure must already exist */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR container itself as well as its values array */
  if (!csr || !csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller will overwrite the values on the device: mark the GPU copy as the valid one and invalidate the transpose */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4327ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - address of the pointer to the device data; the pointer is set to NULL on return

   Level: developer

   Notes: increases the object state of A

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* bump the object state of A now that write access is being given back */
  err = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(err);
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4355ed502f03SStefano Zampini 
/* Ordering functor for 4-tuples: compares element 0 first and, on ties, element 1
   (the row and column index when used by MatSeqAIJCUSPARSEMergeMats()).
   Elements 2 and 3 do not take part in the comparison. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    /* use the free function thrust::get<N>(): it is the documented tuple accessor */
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
4366ed502f03SStefano Zampini 
/* Unary functor that adds a fixed integer offset to its argument */
struct Shift
{
  int _shift; /* offset added to every input value */

  __host__ __device__
  Shift(int shift) : _shift(shift) {}
  /* const-qualified so that the functor can be invoked through a const object or reference */
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
4378ed502f03SStefano Zampini 
4379ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4380ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4381ed502f03SStefano Zampini {
4382ed502f03SStefano Zampini   PetscErrorCode               ierr;
4383ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4384ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4385ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4386ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4387ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4388ed502f03SStefano Zampini   cusparseStatus_t             stat;
4389ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4390ed502f03SStefano Zampini   cudaError_t                  cerr;
4391ed502f03SStefano Zampini 
4392ed502f03SStefano Zampini   PetscFunctionBegin;
4393ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4394ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4395ed502f03SStefano Zampini   PetscValidPointer(C,4);
4396ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4397ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
4398ed502f03SStefano Zampini   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
4399ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4400ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4401ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4402ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4403ed502f03SStefano Zampini     m     = A->rmap->n;
4404ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
4405ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
4406ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
4407ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4408ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4409ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4410ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4411ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4412ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4413ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4414ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4415ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4416ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4417ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4418ed502f03SStefano Zampini     Ccusp->nrows    = m;
4419ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4420ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4421ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4422ed502f03SStefano Zampini     Ccsr->num_cols  = n;
4423ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4424ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4425ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4426ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4427ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4428ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4429ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4430ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4431ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4432ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4433ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4434ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4435ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4436ed502f03SStefano Zampini 
4437ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4438ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4439ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4440ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4441ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4442ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4443ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4444ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4445ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4446ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4447ed502f03SStefano Zampini     if (c->nz) {
44482ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
44492ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
44502ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
44512ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
44522ed87e7eSStefano Zampini 
4453ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4454ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4455ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4456ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4457ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4458ed502f03SStefano Zampini         }
44592ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
44602ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4461ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4462ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4463ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4464ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4465ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4466ed502f03SStefano Zampini         }
44672ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
44682ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
4469ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
44702ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
44712ed87e7eSStefano Zampini                               Aroff->data().get(),
44722ed87e7eSStefano Zampini                               Annz,
44732ed87e7eSStefano Zampini                               m,
44742ed87e7eSStefano Zampini                               Acoo->data().get(),
44752ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4476ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
44772ed87e7eSStefano Zampini                               Broff->data().get(),
4478ed502f03SStefano Zampini                               Bnnz,
4479ed502f03SStefano Zampini                               m,
44802ed87e7eSStefano Zampini                               Bcoo->data().get(),
4481ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
44822ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
44832ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
44842ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
44858909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4486ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4487ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
44888909a122SStefano Zampini #else
44898909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
44908909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
44918909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
44928909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
44938909a122SStefano Zampini #endif
44942ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
44952ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
44962ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
44972ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
44982ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
44992ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4500ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4501ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4502ed502f03SStefano Zampini       thrust::advance(p2,Annz);
45032ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
45048909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
45058909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
45068909a122SStefano Zampini #endif
45072ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
45082ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
45092ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
45102ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
45112ed87e7eSStefano Zampini #else
45122ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
45132ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
45142ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
45152ed87e7eSStefano Zampini #endif
4516ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
45172ed87e7eSStefano Zampini                               Ccoo->data().get(),
4518ed502f03SStefano Zampini                               c->nz,
4519ed502f03SStefano Zampini                               m,
4520ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4521ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4522ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
45232ed87e7eSStefano Zampini       delete wPerm;
45242ed87e7eSStefano Zampini       delete Acoo;
45252ed87e7eSStefano Zampini       delete Bcoo;
45262ed87e7eSStefano Zampini       delete Ccoo;
4527ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4528ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4529ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4530ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4531ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4532ed502f03SStefano Zampini #endif
45331a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
45343606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
45353606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
4536ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4537ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4538ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4539ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4540ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4541ed502f03SStefano Zampini 
45421a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
45431a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4544a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4545ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4546ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4547ed502f03SStefano Zampini         CcsrT->num_rows = n;
4548ed502f03SStefano Zampini         CcsrT->num_cols = m;
4549ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4550ed502f03SStefano Zampini 
4551ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4552ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4553ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4554ed502f03SStefano Zampini 
4555ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4556ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4557ed502f03SStefano Zampini         if (AT) {
4558ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4559ed502f03SStefano Zampini           thrust::advance(rT,-1);
4560ed502f03SStefano Zampini         }
4561ed502f03SStefano Zampini         if (BT) {
4562ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4563ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4564ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4565ed502f03SStefano Zampini         }
4566ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4567ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4568ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4569ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4570ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4571ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4572ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4573ed502f03SStefano Zampini 
4574ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4575ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4576ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4577ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4578ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4579ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4580ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4581ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4582ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4583ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4584ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4585ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4586ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4587ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4588ed502f03SStefano Zampini #endif
4589ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4590ed502f03SStefano Zampini       }
4591ed502f03SStefano Zampini     }
4592ed502f03SStefano Zampini 
4593ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4594ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4595ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4596ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4597ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4598ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4599ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4600ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4601ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4602ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4603ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4604ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4605ed502f03SStefano Zampini     } else {
4606ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4607ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4608ed502f03SStefano Zampini     }
4609ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4610ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4611ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4612ed502f03SStefano Zampini     c->maxnz = c->nz;
4613ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4614ed502f03SStefano Zampini     c->rmax = 0;
4615ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4616ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4617ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4618ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4619ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4620ed502f03SStefano Zampini     }
4621ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4622ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4623ed502f03SStefano Zampini     (*C)->nonzerostate++;
4624ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4625ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4626ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4627ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4628ed502f03SStefano Zampini   } else {
4629ed502f03SStefano Zampini     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4630ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4631ed502f03SStefano Zampini     if (c->nz) {
4632ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4633ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4634ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4635ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4636ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4637ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4638ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4639ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4640ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4641ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4642ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4643ed502f03SStefano Zampini       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4644ed502f03SStefano Zampini       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4645ed502f03SStefano Zampini       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4646ed502f03SStefano Zampini       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4647ed502f03SStefano Zampini       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4648ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4649ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4650ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4651ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4652ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4653ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4654ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4655ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4656ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4657ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4658ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4659ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4660ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4661a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
46621a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4663ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4664ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4665ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4666ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4667ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4668ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4669ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4670ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
46711a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4672ed502f03SStefano Zampini       }
4673ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4674ed502f03SStefano Zampini     }
4675ed502f03SStefano Zampini   }
4676ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4677ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4678ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4679ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4680ed502f03SStefano Zampini   PetscFunctionReturn(0);
4681ed502f03SStefano Zampini }
4682c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies n entries of the value array obtained with
  MatSeqAIJCUSPARSEGetArrayRead() into v.

  Input Parameters:
+ A   - the matrix whose value array is read
. n   - number of entries to copy
- idx - positions (into the value array) of the entries to gather, or NULL to copy the first n entries

  Output Parameter:
. v - destination for the n entries; isCudaMem(v) selects between the device and the host code path

  Notes:
  idx is handed to thrust as a raw pointer range, i.e. it is treated as host memory and uploaded.
  When v is not device memory, the gathered values are staged in a temporary device buffer and
  then copied back to the host.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices directly (no default fill followed by an overwrite) */
    THRUSTINTARRAY widx(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* device staging buffer, only sized when v lives on the host; being a scoped object it is
       released on every exit path, including the early returns taken by the error macros */
    THRUSTARRAY w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[k]] with dv[k]; VecCUDAEquals (defined elsewhere) is expected to assign the
       first tuple element to the second, as in its other uses in this file */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* no index set (or nothing to gather): plain contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values travelled from the device to the host, so log them in that direction */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4722