xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 67a457605a8a8f203835aee407e0fdacb8a7b609)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
16e8d2b73aSMark Adams 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21afb2bd1cSJunchao Zhang 
22afb2bd1cSJunchao Zhang   typedef enum {
23afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
24afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
25afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
27afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
28afb2bd1cSJunchao Zhang 
29afb2bd1cSJunchao Zhang   typedef enum {
30afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
42afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
43afb2bd1cSJunchao Zhang 
44afb2bd1cSJunchao Zhang   typedef enum {
45afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
48afb2bd1cSJunchao Zhang   */
49afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52afb2bd1cSJunchao Zhang #endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
6833c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
696fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
716fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
726fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
769ae82921SPaul Mullowney 
777f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
827f756511SDominic Meiser 
83042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
8657181aedSStefano Zampini 
877e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
887e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
897e8381f9SStefano Zampini 
90c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91c215019aSStefano Zampini 
92b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
93b06137fdSPaul Mullowney {
94b06137fdSPaul Mullowney   cusparseStatus_t   stat;
95b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
96b06137fdSPaul Mullowney 
97b06137fdSPaul Mullowney   PetscFunctionBegin;
98d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
99b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
10057d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
101b06137fdSPaul Mullowney   PetscFunctionReturn(0);
102b06137fdSPaul Mullowney }
103b06137fdSPaul Mullowney 
104b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
105b06137fdSPaul Mullowney {
106b06137fdSPaul Mullowney   cusparseStatus_t   stat;
107b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
108b06137fdSPaul Mullowney 
109b06137fdSPaul Mullowney   PetscFunctionBegin;
110d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1116b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11216a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11357d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
11416a2e217SAlejandro Lamas Daviña     }
115b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1166b1cf21dSAlejandro Lamas Daviña   }
11757d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
118b06137fdSPaul Mullowney   PetscFunctionReturn(0);
119b06137fdSPaul Mullowney }
120b06137fdSPaul Mullowney 
121b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
122b06137fdSPaul Mullowney {
123b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1247e8381f9SStefano Zampini   PetscBool          flg;
1257e8381f9SStefano Zampini   PetscErrorCode     ierr;
126ccdfe979SStefano Zampini 
127b06137fdSPaul Mullowney   PetscFunctionBegin;
1287e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1297e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
130ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
131b06137fdSPaul Mullowney   PetscFunctionReturn(0);
132b06137fdSPaul Mullowney }
133b06137fdSPaul Mullowney 
134ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1359ae82921SPaul Mullowney {
1369ae82921SPaul Mullowney   PetscFunctionBegin;
1379ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1389ae82921SPaul Mullowney   PetscFunctionReturn(0);
1399ae82921SPaul Mullowney }
1409ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1539ae82921SPaul Mullowney 
15442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1559ae82921SPaul Mullowney {
1569ae82921SPaul Mullowney   PetscErrorCode ierr;
157bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1589ae82921SPaul Mullowney 
1599ae82921SPaul Mullowney   PetscFunctionBegin;
160bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
161bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1622c7c0729SBarry Smith   (*B)->factortype = ftype;
1639ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1642205254eSKarl Rupp 
1659c1083e7SRichard Tran Mills   if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
166087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16733d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1689c1083e7SRichard Tran Mills     if (!A->boundtocpu) {
1699ae82921SPaul Mullowney       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1709ae82921SPaul Mullowney       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1719c1083e7SRichard Tran Mills     } else {
1729c1083e7SRichard Tran Mills       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1739c1083e7SRichard Tran Mills       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1749c1083e7SRichard Tran Mills     }
1754ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1764ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
1774ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
178087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1799c1083e7SRichard Tran Mills     if (!A->boundtocpu) {
180087f3262SPaul Mullowney       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
181087f3262SPaul Mullowney       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1829c1083e7SRichard Tran Mills     } else {
1839c1083e7SRichard Tran Mills       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1849c1083e7SRichard Tran Mills       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1859c1083e7SRichard Tran Mills     }
1864ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
1874ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
1889ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
189bc3f50f2SPaul Mullowney 
190fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1914ac6704cSBarry Smith   (*B)->canuseordering = PETSC_TRUE;
1923ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1939ae82921SPaul Mullowney   PetscFunctionReturn(0);
1949ae82921SPaul Mullowney }
1959ae82921SPaul Mullowney 
196bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
197ca45077fSPaul Mullowney {
198aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1996e111a19SKarl Rupp 
200ca45077fSPaul Mullowney   PetscFunctionBegin;
201ca45077fSPaul Mullowney   switch (op) {
202e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
203aa372e3fSPaul Mullowney     cusparsestruct->format = format;
204ca45077fSPaul Mullowney     break;
205e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
206aa372e3fSPaul Mullowney     cusparsestruct->format = format;
207ca45077fSPaul Mullowney     break;
208ca45077fSPaul Mullowney   default:
20936d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
210ca45077fSPaul Mullowney   }
211ca45077fSPaul Mullowney   PetscFunctionReturn(0);
212ca45077fSPaul Mullowney }
2139ae82921SPaul Mullowney 
214e057df02SPaul Mullowney /*@
215e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
216e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
217aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
218e057df02SPaul Mullowney    Not Collective
219e057df02SPaul Mullowney 
220e057df02SPaul Mullowney    Input Parameters:
2218468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
22236d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2232692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
224e057df02SPaul Mullowney 
225e057df02SPaul Mullowney    Output Parameter:
226e057df02SPaul Mullowney 
227e057df02SPaul Mullowney    Level: intermediate
228e057df02SPaul Mullowney 
2298468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
230e057df02SPaul Mullowney @*/
231e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
232e057df02SPaul Mullowney {
233e057df02SPaul Mullowney   PetscErrorCode ierr;
2346e111a19SKarl Rupp 
235e057df02SPaul Mullowney   PetscFunctionBegin;
236e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
237e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
238e057df02SPaul Mullowney   PetscFunctionReturn(0);
239e057df02SPaul Mullowney }
240e057df02SPaul Mullowney 
2411a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
242e6e9a74fSStefano Zampini {
243e6e9a74fSStefano Zampini   PetscErrorCode ierr;
244e6e9a74fSStefano Zampini 
245e6e9a74fSStefano Zampini   PetscFunctionBegin;
2461a2c6b5cSJunchao Zhang   switch (op) {
2471a2c6b5cSJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
2481a2c6b5cSJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
2491a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
2501a2c6b5cSJunchao Zhang       A->form_explicit_transpose = flg;
2511a2c6b5cSJunchao Zhang       break;
2521a2c6b5cSJunchao Zhang     default:
2531a2c6b5cSJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
2541a2c6b5cSJunchao Zhang       break;
255e6e9a74fSStefano Zampini   }
256e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
257e6e9a74fSStefano Zampini }
258e6e9a74fSStefano Zampini 
259bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
260bddcd29dSMark Adams 
261bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
262bddcd29dSMark Adams {
263bddcd29dSMark Adams   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
264bddcd29dSMark Adams   IS             isrow = b->row,iscol = b->col;
265bddcd29dSMark Adams   PetscBool      row_identity,col_identity;
266bddcd29dSMark Adams   PetscErrorCode ierr;
267bddcd29dSMark Adams 
268bddcd29dSMark Adams   PetscFunctionBegin;
269bddcd29dSMark Adams   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
270bddcd29dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
271bddcd29dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
272bddcd29dSMark Adams   /* determine which version of MatSolve needs to be used. */
273bddcd29dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
274bddcd29dSMark Adams   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
275bddcd29dSMark Adams   if (row_identity && col_identity) {
276bddcd29dSMark Adams     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
277bddcd29dSMark Adams     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
278bddcd29dSMark Adams     B->ops->matsolve = NULL;
279bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
280bddcd29dSMark Adams   } else {
281bddcd29dSMark Adams     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
282bddcd29dSMark Adams     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
283bddcd29dSMark Adams     B->ops->matsolve = NULL;
284bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
285bddcd29dSMark Adams   }
286bddcd29dSMark Adams 
287bddcd29dSMark Adams   /* get the triangular factors */
288bddcd29dSMark Adams   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
289bddcd29dSMark Adams   PetscFunctionReturn(0);
290bddcd29dSMark Adams }
291bddcd29dSMark Adams 
2924416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2939ae82921SPaul Mullowney {
2949ae82921SPaul Mullowney   PetscErrorCode           ierr;
295e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2969ae82921SPaul Mullowney   PetscBool                flg;
297a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2986e111a19SKarl Rupp 
2999ae82921SPaul Mullowney   PetscFunctionBegin;
300e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
3019ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
302e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
303a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
304afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
305afb2bd1cSJunchao Zhang 
3064c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
307a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
308afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
309afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
310afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
311afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
312afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
313a435da06SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
314a435da06SStefano Zampini     if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
315a435da06SStefano Zampini #else
316afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
317a435da06SStefano Zampini #endif
318afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
319afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
320afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
321afb2bd1cSJunchao Zhang 
322afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
323afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
324afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
325afb2bd1cSJunchao Zhang    #endif
3264c87dfd4SPaul Mullowney   }
3270af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
3289ae82921SPaul Mullowney   PetscFunctionReturn(0);
3299ae82921SPaul Mullowney }
3309ae82921SPaul Mullowney 
3316fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3329ae82921SPaul Mullowney {
333da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3349ae82921SPaul Mullowney   PetscErrorCode               ierr;
3359ae82921SPaul Mullowney 
3369ae82921SPaul Mullowney   PetscFunctionBegin;
337da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3389ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3399ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3409ae82921SPaul Mullowney   PetscFunctionReturn(0);
3419ae82921SPaul Mullowney }
3429ae82921SPaul Mullowney 
3436fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3449ae82921SPaul Mullowney {
345da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3469ae82921SPaul Mullowney   PetscErrorCode               ierr;
3479ae82921SPaul Mullowney 
3489ae82921SPaul Mullowney   PetscFunctionBegin;
349da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3509ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3519ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3529ae82921SPaul Mullowney   PetscFunctionReturn(0);
3539ae82921SPaul Mullowney }
3549ae82921SPaul Mullowney 
355087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
356087f3262SPaul Mullowney {
357da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
358087f3262SPaul Mullowney   PetscErrorCode               ierr;
359087f3262SPaul Mullowney 
360087f3262SPaul Mullowney   PetscFunctionBegin;
361da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
362087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
363087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
364087f3262SPaul Mullowney   PetscFunctionReturn(0);
365087f3262SPaul Mullowney }
366087f3262SPaul Mullowney 
367087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
368087f3262SPaul Mullowney {
369da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
370087f3262SPaul Mullowney   PetscErrorCode               ierr;
371087f3262SPaul Mullowney 
372087f3262SPaul Mullowney   PetscFunctionBegin;
373da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
374087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
375087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
376087f3262SPaul Mullowney   PetscFunctionReturn(0);
377087f3262SPaul Mullowney }
378087f3262SPaul Mullowney 
379087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3809ae82921SPaul Mullowney {
3819ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3829ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3839ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
384aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3859ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3869ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3879ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3889ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3899ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
390b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
39157d48284SJunchao Zhang   cudaError_t                       cerr;
3929ae82921SPaul Mullowney 
3939ae82921SPaul Mullowney   PetscFunctionBegin;
394cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
395c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3969ae82921SPaul Mullowney     try {
3979ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3989ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
399da79fbbcSStefano Zampini       if (!loTriFactor) {
4002cbc15d9SMark         PetscScalar                       *AALo;
4012cbc15d9SMark 
4022cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4039ae82921SPaul Mullowney 
4049ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
40557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
40657d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
4079ae82921SPaul Mullowney 
4089ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
4099ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
4109ae82921SPaul Mullowney         AiLo[n]  = nzLower;
4119ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
4129ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
4139ae82921SPaul Mullowney         v        = aa;
4149ae82921SPaul Mullowney         vi       = aj;
4159ae82921SPaul Mullowney         offset   = 1;
4169ae82921SPaul Mullowney         rowOffset= 1;
4179ae82921SPaul Mullowney         for (i=1; i<n; i++) {
4189ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
419e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
4209ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
4219ae82921SPaul Mullowney           rowOffset += nz+1;
4229ae82921SPaul Mullowney 
423580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
424580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
4259ae82921SPaul Mullowney 
4269ae82921SPaul Mullowney           offset      += nz;
4279ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4289ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4299ae82921SPaul Mullowney           offset      += 1;
4309ae82921SPaul Mullowney 
4319ae82921SPaul Mullowney           v  += nz;
4329ae82921SPaul Mullowney           vi += nz;
4339ae82921SPaul Mullowney         }
4342205254eSKarl Rupp 
435aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
436da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
437da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
438aa372e3fSPaul Mullowney         /* Create the matrix description */
43957d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
44057d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4411b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
442afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
443afb2bd1cSJunchao Zhang        #else
44457d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
445afb2bd1cSJunchao Zhang        #endif
44657d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
44757d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
448aa372e3fSPaul Mullowney 
449aa372e3fSPaul Mullowney         /* set the operation */
450aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
451aa372e3fSPaul Mullowney 
452aa372e3fSPaul Mullowney         /* set the matrix */
453aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
454aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
455aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
456aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
457aa372e3fSPaul Mullowney 
458aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
459aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
460aa372e3fSPaul Mullowney 
461aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
462aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
463aa372e3fSPaul Mullowney 
464aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
465aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
466aa372e3fSPaul Mullowney 
467afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
468da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
469afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4701b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
471afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
472afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
473afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
474afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
475afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
476afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
477afb2bd1cSJunchao Zhang       #endif
478afb2bd1cSJunchao Zhang 
479aa372e3fSPaul Mullowney         /* perform the solve analysis */
480aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
481aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
482aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
483d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
4841b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
485d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
486d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
487d49cd2b7SBarry Smith                                #else
488d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
489afb2bd1cSJunchao Zhang                                #endif
490da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
491da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
492aa372e3fSPaul Mullowney 
493da79fbbcSStefano Zampini         /* assign the pointer */
494aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4952cbc15d9SMark         loTriFactor->AA_h = AALo;
49657d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
49757d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4984863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
499da79fbbcSStefano Zampini       } else { /* update values only */
5002cbc15d9SMark         if (!loTriFactor->AA_h) {
5012cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
5022cbc15d9SMark         }
503da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
5042cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
505da79fbbcSStefano Zampini         v        = aa;
506da79fbbcSStefano Zampini         vi       = aj;
507da79fbbcSStefano Zampini         offset   = 1;
508da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
509da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
5102cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
511da79fbbcSStefano Zampini           offset      += nz;
5122cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
513da79fbbcSStefano Zampini           offset      += 1;
514da79fbbcSStefano Zampini           v  += nz;
515da79fbbcSStefano Zampini         }
5162cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
517da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
518da79fbbcSStefano Zampini       }
5199ae82921SPaul Mullowney     } catch(char *ex) {
5209ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5219ae82921SPaul Mullowney     }
5229ae82921SPaul Mullowney   }
5239ae82921SPaul Mullowney   PetscFunctionReturn(0);
5249ae82921SPaul Mullowney }
5259ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - stages the upper triangular factor held in the host
  Mat_SeqAIJ data of A and copies it into the CSR matrix of the upper Mat_SeqAIJCUSPARSETriFactorStruct.

  Nothing is done when A has no rows, or when A->offloadmask is neither PETSC_OFFLOAD_UNALLOCATED
  nor PETSC_OFFLOAD_CPU.

  When no upper factor struct exists yet, the row offsets, column indices and values are all built,
  the cusparse matrix descriptor is created and the solve analysis is performed. When one already
  exists only the numerical values are regenerated and re-uploaded.

  For every row the reciprocal of the host entry at a->diag[row] is written as the first entry of the
  row (column index = row), followed by the remaining entries of the row copied unchanged.
  NOTE(review): presumably the host factor stores the inverted diagonal, which is why it is inverted
  again here - confirm against the host factorization routines.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper    = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  const PetscInt                    nrows     = A->rmap->n;
  const PetscInt                    *hostCols = aij->j,*diagPos = aij->diag;
  const MatScalar                   *hostVals = aij->a;
  PetscInt                          row,nOff,nzUpper,cursor;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* only (re)build when the offload mask says the host holds the data to be copied */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of stored entries in the upper triangular part, diagonal included */
    nzUpper = diagPos[0]-diagPos[nrows];

    if (upper) {
      /* the factor struct already exists: regenerate and upload the values only */
      if (!upper->AA_h) {
        cerr = cudaMallocHost((void**) &upper->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }

      /* rows are visited last to first; cursor walks backwards through the packed value array */
      cursor = nzUpper;
      for (row=nrows; row-- > 0;) {
        const MatScalar *rowVals = hostVals + diagPos[row+1] + 1;

        /* entries of this row that are NOT on the diagonal */
        nOff    = diagPos[row] - diagPos[row+1] - 1;
        cursor -= nOff+1;

        /* rowVals[nOff] is the host entry at diagPos[row]; its reciprocal leads the row */
        upper->AA_h[cursor] = 1./rowVals[nOff];
        ierr = PetscArraycpy(&(upper->AA_h[cursor+1]), rowVals, nOff);CHKERRQ(ierr);
      }

      upper->csrMat->values->assign(upper->AA_h, upper->AA_h+nzUpper);
      ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      PetscScalar *AAUp;
      PetscInt    *AiUp,*AjUp;
      CsrMatrix   *csr;

      /* host staging buffers (allocated with cudaMallocHost) for values, row offsets and column indices */
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AiUp, (nrows+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* Fill the staging buffers from the last row to the first */
      AiUp[0]     = (PetscInt) 0;
      AiUp[nrows] = nzUpper;
      cursor      = nzUpper;
      for (row=nrows; row-- > 0;) {
        const PetscInt  first    = diagPos[row+1] + 1;
        const MatScalar *rowVals = hostVals + first;
        const PetscInt  *rowCols = hostCols + first;

        /* entries of this row that are NOT on the diagonal */
        nOff    = diagPos[row] - diagPos[row+1] - 1;
        cursor -= nOff+1;

        /* the diagonal entry leads the row ... */
        AjUp[cursor] = (PetscInt) row;
        AAUp[cursor] = (MatScalar)1./rowVals[nOff];
        AiUp[row]    = AiUp[row+1] - (nOff+1);

        /* ... and the off-diagonal entries follow it */
        ierr = PetscArraycpy(&(AjUp[cursor+1]), rowCols, nOff);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[cursor+1]), rowVals, nOff);CHKERRQ(ierr);
      }

      /* create the triangular factor struct and choose the solve policy */
      ierr = PetscNew(&upper);CHKERRQ(ierr);
      upper->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero-based, upper fill mode, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upper->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upper->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upper->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upper->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* the factor is solved as stored, without transposition */
      upper->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR container; each assign() copies a staging buffer into the corresponding thrust array */
      upper->csrMat    = new CsrMatrix;
      csr              = upper->csrMat;
      csr->num_rows    = nrows;
      csr->num_cols    = nrows;
      csr->num_entries = nzUpper;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(AiUp, AiUp+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nzUpper);
      csr->column_indices->assign(AjUp, AjUp+nzUpper);

      csr->values = new THRUSTARRAY(nzUpper);
      csr->values->assign(AAUp, AAUp+nzUpper);

      /* solve analysis, bracketed by the MAT_CUSPARSESolveAnalysis log event */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&upper->solveInfo);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      /* this interface needs a caller-provided work buffer: query its size, then allocate it */
      stat = cusparse_get_svbuffsize(factors->handle, upper->solveOp,
                                     csr->num_rows, csr->num_entries, upper->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), upper->solveInfo,
                                     &upper->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upper->solveBuffer,upper->solveBufferSize);CHKERRCUDA(cerr);

      stat = cusparse_analysis(factors->handle, upper->solveOp,
                               csr->num_rows, csr->num_entries, upper->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upper->solveInfo,
                               upper->solvePolicy, upper->solveBuffer);CHKERRCUSPARSE(stat);
     #else
      stat = cusparse_analysis(factors->handle, upper->solveOp,
                               csr->num_rows, csr->num_entries, upper->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upper->solveInfo);CHKERRCUSPARSE(stat);
     #endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the value staging buffer is kept in AA_h, the index buffers are released */
      factors->upTriFactorPtr = upper;
      upper->AA_h             = AAUp;
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((nrows+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
6719ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds the lower and upper triangular ILU factors through
  the two Build routines, makes sure the work vector exists, records the nonzero count, marks A as
  PETSC_OFFLOAD_BOTH and, for a non-identity row (a->row) or column (a->icol) index set whose
  permutation array has not been created yet, copies its indices into a thrust array.

  Raises PETSC_ERR_COR when A->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  IS                           rowperm  = aij->row,colperm = aij->icol;
  const PetscInt               nrows    = A->rmap->n;
  PetscBool                    isIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* triangular factors: lower first, then upper */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* the work vector is created once and reused by later calls */
  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(nrows);
  }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: copied only when it is not the identity and has not been copied before */
  ierr = ISIdentity(rowperm,&isIdentity);CHKERRQ(ierr);
  if (!factors->rpermIndices && !isIdentity) {
    const PetscInt *idx;

    ierr = ISGetIndices(rowperm,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(rowperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same treatment */
  ierr = ISIdentity(colperm,&isIdentity);CHKERRQ(ierr);
  if (!factors->cpermIndices && !isIdentity) {
    const PetscInt *idx;

    ierr = ISGetIndices(colperm,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(colperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
7159ae82921SPaul Mullowney 
716087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
717087f3262SPaul Mullowney {
718087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
719087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
720aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
721aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
722087f3262SPaul Mullowney   cusparseStatus_t                  stat;
723087f3262SPaul Mullowney   PetscErrorCode                    ierr;
72457d48284SJunchao Zhang   cudaError_t                       cerr;
725087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
726087f3262SPaul Mullowney   PetscScalar                       *AAUp;
727087f3262SPaul Mullowney   PetscScalar                       *AALo;
728087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
729087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
730087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
731087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
732087f3262SPaul Mullowney 
733087f3262SPaul Mullowney   PetscFunctionBegin;
734cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
735c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
736087f3262SPaul Mullowney     try {
737da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
738da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
739da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
740087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
74157d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
74257d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
743087f3262SPaul Mullowney 
744087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
745087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
746087f3262SPaul Mullowney         AiUp[n]=nzUpper;
747087f3262SPaul Mullowney         offset = 0;
748087f3262SPaul Mullowney         for (i=0; i<n; i++) {
749087f3262SPaul Mullowney           /* set the pointers */
750087f3262SPaul Mullowney           v  = aa + ai[i];
751087f3262SPaul Mullowney           vj = aj + ai[i];
752087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
753087f3262SPaul Mullowney 
754087f3262SPaul Mullowney           /* first, set the diagonal elements */
755087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
75609f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
757087f3262SPaul Mullowney           AiUp[i]      = offset;
75809f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
759087f3262SPaul Mullowney 
760087f3262SPaul Mullowney           offset+=1;
761087f3262SPaul Mullowney           if (nz>0) {
762f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
763580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
764087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
765087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
766087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
767087f3262SPaul Mullowney             }
768087f3262SPaul Mullowney             offset+=nz;
769087f3262SPaul Mullowney           }
770087f3262SPaul Mullowney         }
771087f3262SPaul Mullowney 
772aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
773da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
774da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
775087f3262SPaul Mullowney 
776aa372e3fSPaul Mullowney         /* Create the matrix description */
77757d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
77857d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7791b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
780afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
781afb2bd1cSJunchao Zhang        #else
78257d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
783afb2bd1cSJunchao Zhang        #endif
78457d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
78557d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
786087f3262SPaul Mullowney 
787aa372e3fSPaul Mullowney         /* set the matrix */
788aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
789aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
790aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
791aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
792aa372e3fSPaul Mullowney 
793aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
794aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
795aa372e3fSPaul Mullowney 
796aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
797aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
798aa372e3fSPaul Mullowney 
799aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
800aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
801aa372e3fSPaul Mullowney 
802afb2bd1cSJunchao Zhang         /* set the operation */
803afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
804afb2bd1cSJunchao Zhang 
805afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
806da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
807afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8081b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
809afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
810afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
811afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
812afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
813afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
814afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
815afb2bd1cSJunchao Zhang       #endif
816afb2bd1cSJunchao Zhang 
817aa372e3fSPaul Mullowney         /* perform the solve analysis */
818aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
819aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
820aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
821d49cd2b7SBarry Smith                                  upTriFactor->csrMat->column_indices->data().get(),
8221b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823d49cd2b7SBarry Smith                                  upTriFactor->solveInfo,
824d49cd2b7SBarry Smith                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
825d49cd2b7SBarry Smith                                 #else
826d49cd2b7SBarry Smith                                   upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
827afb2bd1cSJunchao Zhang                                 #endif
828da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
829da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
830aa372e3fSPaul Mullowney 
831da79fbbcSStefano Zampini         /* assign the pointer */
832aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
833aa372e3fSPaul Mullowney 
834aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
835da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
836da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
837aa372e3fSPaul Mullowney 
838aa372e3fSPaul Mullowney         /* Create the matrix description */
83957d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
84057d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8411b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
842afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
843afb2bd1cSJunchao Zhang        #else
84457d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
845afb2bd1cSJunchao Zhang        #endif
84657d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
84757d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
848aa372e3fSPaul Mullowney 
849aa372e3fSPaul Mullowney         /* set the operation */
850aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
851aa372e3fSPaul Mullowney 
852aa372e3fSPaul Mullowney         /* set the matrix */
853aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
854aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
855aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
856aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
857aa372e3fSPaul Mullowney 
858aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
859aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
860aa372e3fSPaul Mullowney 
861aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
862aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
863aa372e3fSPaul Mullowney 
864aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
865aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
866aa372e3fSPaul Mullowney 
867afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
868da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
869afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8701b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
871afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
872afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
873afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
874afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
875afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
876afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
877afb2bd1cSJunchao Zhang       #endif
878afb2bd1cSJunchao Zhang 
879aa372e3fSPaul Mullowney         /* perform the solve analysis */
880aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
881aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
882aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
883d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
8841b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
885d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
886d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
887d49cd2b7SBarry Smith                                 #else
888d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
889afb2bd1cSJunchao Zhang                                 #endif
890da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
891da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
892aa372e3fSPaul Mullowney 
893da79fbbcSStefano Zampini         /* assign the pointer */
894aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
895087f3262SPaul Mullowney 
896da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
89757d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
89857d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
899da79fbbcSStefano Zampini       } else {
900da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
901da79fbbcSStefano Zampini         offset = 0;
902da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
903da79fbbcSStefano Zampini           /* set the pointers */
904da79fbbcSStefano Zampini           v  = aa + ai[i];
905da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
906da79fbbcSStefano Zampini 
907da79fbbcSStefano Zampini           /* first, set the diagonal elements */
908da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
909da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
910da79fbbcSStefano Zampini 
911da79fbbcSStefano Zampini           offset+=1;
912da79fbbcSStefano Zampini           if (nz>0) {
913da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
914da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
915da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
916da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
917da79fbbcSStefano Zampini             }
918da79fbbcSStefano Zampini             offset+=nz;
919da79fbbcSStefano Zampini           }
920da79fbbcSStefano Zampini         }
921da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
922da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
923da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
924da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
925da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
926da79fbbcSStefano Zampini       }
92757d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
92857d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
929087f3262SPaul Mullowney     } catch(char *ex) {
930087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
931087f3262SPaul Mullowney     }
932087f3262SPaul Mullowney   }
933087f3262SPaul Mullowney   PetscFunctionReturn(0);
934087f3262SPaul Mullowney }
935087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - calls MatSeqAIJCUSPARSEBuildICCTriMatrices() for the
  factored matrix and, when the row ordering is not the identity, copies the ordering and its
  inverse into the device index arrays of the triangular-factor container.

  Input Parameter:
. A - the factored matrix; A->spptr must point to a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  The work vector and the permutation arrays are allocated only when they do not exist yet and
  are refilled otherwise, so calling this routine repeatedly (e.g. one call per numeric
  factorization) does not leak the previously allocated arrays.

  Errors with PETSC_ERR_COR if A->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* every stored entry is counted twice except for the n diagonal ones
     (presumably one count per triangular factor - confirm against the solve logging) */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* allocate once, like workVector above; assign() refreshes (and resizes) the contents */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    /* two index arrays of length n were sent to the device */
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
973087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for the CUSPARSE AIJ type.

  The factorization itself is delegated to MatCholeskyFactorNumeric_SeqAIJ(); afterwards the
  solve callbacks of B are selected according to whether the row ordering of B is the identity,
  and MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU() is called on B.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
- info - factorization options, forwarded unchanged
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact   = (Mat_SeqAIJ*)B->data;
  IS             rowperm = fact->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* presumably brings the host copy of A up to date before the CPU factorization - confirm */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* no multi-vector solves are provided, regardless of the ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* pick the MatSolve variants matching the ordering */
  ierr = ISIdentity(rowperm,&natural);CHKERRQ(ierr);
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
10039ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private - creates the explicit transpose (the CSC form
  of the CSR data) of one triangular factor and runs the cuSPARSE solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix, used as the object for event logging
. cusparseTriFactors - container that provides the cuSPARSE handle
- triFactor          - the triangular factor to transpose

  Output Parameter:
. triFactorT_out - the newly allocated transposed factor (set only on success)

  Notes:
  The descriptor of the transpose copies index base, matrix type and diagonal type from
  triFactor and flips the fill mode; its solve operation is CUSPARSE_OPERATION_NON_TRANSPOSE.
  With CUDA >= 11 the csr2csc work buffer is allocated into triFactor (not into the transpose).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorT_out)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the matrix descriptors of the source factor; the fill mode is flipped */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase = cusparseGetMatIndexBase(triFactor->descr);
  fillMode = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above (the original code called PetscLogEventBegin() here again) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           triFactorT->solveInfo,
                           triFactorT->solvePolicy, triFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           triFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorT_out = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds the explicit transposes of the lower and
  upper triangular factors stored in A->spptr, runs the cuSPARSE solve analysis on each, and
  stores them in loTriFactorPtrTranspose / upTriFactorPtrTranspose.

  Input Parameter:
. A - the factored matrix; A->spptr must point to a Mat_SeqAIJCUSPARSETriFactors whose
      loTriFactorPtr and upTriFactorPtr are already set

  Notes:
  The lower factor is processed (and its transpose stored) before the upper one, so a failure
  while handling the upper factor leaves loTriFactorPtrTranspose set.

  Errors with PETSC_ERR_COR if the container or one of the two factors is missing.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/
  ierr = MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1209bda325fcSPaul Mullowney 
/* Unary functor, callable on host and device, that converts a scalar to an integer by
   taking its real part and casting (i.e. truncating) it to PetscInt. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1218a49f1ed0SStefano Zampini 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Creates, or refreshes the values of, the explicitly
  stored transpose kept in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; A->spptr is used as a Mat_SeqAIJCUSPARSE and A->data as a Mat_SeqAIJ

  Notes:
  The matrix is first pushed to the GPU with MatSeqAIJCUSPARSECopyToGPU(). If A->transupdated is
  already set the routine returns immediately (after checking that a transpose struct exists).

  For any storage format other than MAT_CUSPARSE_CSR the existing transpose is invalidated with
  MatSeqAIJCUSPARSEInvalidateTranspose() and rebuilt from scratch.

  For MAT_CUSPARSE_CSR the structure of the transpose is computed only once: csr2csc is applied to
  the sequence 0,1,2,... instead of the real values, so the "values" of the result are the positions
  of the entries in the original value array. These are cached as integers in
  cusparsestruct->csr2csc_i, and every call (including the first) fills the transposed values by
  gathering matrix->values through that permutation.

  On success matTranspose is set and A->transupdated becomes PETSC_TRUE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0); /* nothing to do, the transpose is current */
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR transposes cannot be updated in place: drop the old one so that it is recreated below */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident copies of the scalar constants */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate storage for the transpose (cmap->n rows, rmap->n columns); it is filled further below */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* uncompressed row offsets of A, used as csr2csc input below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The CSC form of A is the CSR form of A^T, which has cmap->n rows and rmap->n columns:
         csr2csc writes cmap->n+1 column pointers into tempT->row_offsets, so that array must be
         sized by the number of columns of A (same convention as matrixT in the CSR branch above). */
      tempT->num_rows = A->cmap->n;
      tempT->num_cols = A->rmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB; the dimensions passed are those of the transpose */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR -> CSC permutation once: transpose the sequence 0,1,2,... in place of the values,
         so that entry k of the transposed "values" holds the position in matrix->values it came from */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* convert the transposed sequence (stored as scalars) to the integer permutation and cache it */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather the current values of A through the cached permutation into the transpose */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1448bda325fcSPaul Mullowney 
1449a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve with the factored matrix A on the GPU,
  applying the row and column permutations stored with the factors.

  Input Parameters:
+ A  - the factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed triangular factors are created on first use with
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). The sequence is: gather bb through rpermIndices
  into xx, solve with the transposed upper factor (xx -> work vector), solve with the transposed
  lower factor (work vector -> xx), gather xx through cpermIndices into the work vector, and copy
  the work vector back to xx.

  Thrust calls are wrapped in PetscStackCallThrust(), as in MatSeqAIJCUSPARSEFormExplicitTranspose().
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: xx[i] = bb[rperm[i]] */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* Next, solve U (transposed upper factor): input xarray, output the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L (transposed lower factor): input the work vector, output xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1537bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transpose solve with the factored matrix A
  on the GPU; no permutation of bb or xx is applied.

  Input Parameters:
+ A  - the factored matrix; A->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed triangular factors are built lazily with MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
  The transposed upper factor is applied to bb (result in the work vector), then the transposed
  lower factor is applied to the work vector (result in xx).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  PetscErrorCode                    ierr;
  cusparseStatus_t                  cerr;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;

  PetscFunctionBegin;
  /* Neither transposed factor exists yet: run the analysis and re-read both pointers */
  if (!(loTriFactorPresent(lowerT,upperT))) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the output and input vectors */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: transposed upper factor, b -> work */
  cerr = cusparse_solve(factors->handle, upperT->solveOp,
                        upperT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upperT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        b,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        work->data().get(),
                        upperT->solvePolicy, upperT->solveBuffer);CHKERRCUSPARSE(cerr);
                      #else
                        work->data().get());CHKERRCUSPARSE(cerr);
                      #endif

  /* Step 2: transposed lower factor, work -> x */
  cerr = cusparse_solve(factors->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lowerT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        work->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        x,
                        lowerT->solvePolicy, lowerT->solveBuffer);CHKERRCUSPARSE(cerr);
                      #else
                        x);CHKERRCUSPARSE(cerr);
                      #endif

  /* Hand the arrays back, then close the timing region and log the flop count */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1607bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Applies the lower and then the upper triangular solve held in the
  Mat_SeqAIJCUSPARSETriFactors attached to A->spptr, with a permuted gather before and after.

  Input Parameters:
+ A  - the factored matrix (A->spptr is read as Mat_SeqAIJCUSPARSETriFactors)
- bb - the right-hand side vector

  Output Parameter:
. xx - the vector receiving the result

  Notes:
  Data flow: bb is gathered through rpermIndices into the work vector, the L solve writes into
  the array of xx, the U solve writes back into the work vector, and the work vector is finally
  gathered through cpermIndices into xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors          *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                           *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                     *bdev;
  PetscScalar                           *xdev;
  thrust::device_ptr<const PetscScalar> bptr;
  thrust::device_ptr<PetscScalar>       xptr;
  cusparseStatus_t                      stat;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Obtain the device arrays of both vectors and wrap them for use with thrust */
  ierr = VecCUDAGetArrayWrite(xx,&xdev);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bdev);CHKERRQ(ierr);
  xptr = thrust::device_pointer_cast(xdev);
  bptr = thrust::device_pointer_cast(bdev);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: gather the right-hand side through the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(bptr, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bptr, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: L solve, reading the work vector and writing the array of xx */
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lower->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        work->data().get(), xdev
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy, lower->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Step 3: U solve, reading the array of xx and writing back into the work vector */
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upper->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        xdev, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Step 4: gather the work vector through the column permutation into xx */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               xptr);

  /* Hand the device arrays back, then close the timer and record the flop count */
  ierr = VecCUDARestoreArrayRead(bb,&bdev);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xdev);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16839ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Applies the lower and then the upper triangular solve
  held in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr, without any permutation step.

  Input Parameters:
+ A  - the factored matrix (A->spptr is read as Mat_SeqAIJCUSPARSETriFactors)
- bb - the right-hand side vector

  Output Parameter:
. xx - the vector receiving the result

  Notes:
  The L solve reads the array of bb and writes the work vector; the U solve reads the work
  vector and writes the array of xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *bdev;
  PetscScalar                       *xdev;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Obtain the device arrays of both vectors */
  ierr = VecCUDAGetArrayWrite(xx,&xdev);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bdev);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: L solve, reading the array of bb and writing the work vector */
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lower->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        bdev, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy,lower->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Step 2: U solve, reading the work vector and writing the array of xx */
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upper->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(), xdev
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Hand the device arrays back, then close the timer and record the flop count */
  ierr = VecCUDARestoreArrayRead(bb,&bdev);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xdev);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17459ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the matrix values from the device CSR storage into the
  host array a->a when the offload mask says the device holds the current data.

  Input Parameter:
. A - the matrix

  Notes:
  Nothing is done unless A->offloadmask equals PETSC_OFFLOAD_GPU. Only the a->nz values are
  transferred (no index arrays). On completion the mask becomes PETSC_OFFLOAD_BOTH.
  The device storage is accessed as a CsrMatrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *device = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  size_t             nbytes;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* Host copy is already current (or there is nothing on the device): nothing to transfer */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) {
    PetscFunctionReturn(0);
  }

  csr    = (CsrMatrix*)device->mat->mat;
  nbytes = aij->nz*sizeof(PetscScalar);

  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);

  /* Host and device now hold the same values */
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
17667e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host value array of the matrix for read/write
  access, after copying the values back from the device if needed.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host value array (the a field of Mat_SeqAIJ)
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Bring the host values up to date before exposing them */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1776*67a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArray_SeqAIJCUSPARSE - Ends read/write access to the host value array.

  Input Parameters:
+ A     - the matrix
- array - the pointer previously handed out; reset to NULL on return

  Notes:
  The offload mask is set to PETSC_OFFLOAD_CPU, i.e. the host copy is flagged as the valid one.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
1784*67a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayRead_SeqAIJCUSPARSE - Returns the host value array of the matrix for
  read-only access, after copying the values back from the device if needed.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host value array (the a field of Mat_SeqAIJ)
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Bring the host values up to date before exposing them */
  ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1794*67a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE - Ends read-only access to the host value array.

  Input Parameters:
+ A     - the matrix (not modified here)
- array - the pointer previously handed out; reset to NULL on return

  Notes:
  Unlike the read/write and write-only restore routines, the offload mask is left untouched.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
1801*67a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE - Returns the host value array of the matrix for
  write-only access.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host value array (the a field of Mat_SeqAIJ)

  Notes:
  No device-to-host copy is performed here, so the returned array may hold stale values.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
1808*67a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE - Ends write-only access to the host value array.

  Input Parameters:
+ A     - the matrix
- array - the pointer previously handed out; reset to NULL on return

  Notes:
  The offload mask is set to PETSC_OFFLOAD_CPU, i.e. the host copy is flagged as the valid one.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
18167e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device representation of the matrix current.

  Input Parameter:
. A - the matrix

  Notes:
  Errors out if the matrix is bound to the CPU. Work is only done when the offload mask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Two paths exist:
  - values only: when the nonzero state recorded on the device side matches A->nonzerostate and
    the storage format is MAT_CUSPARSE_CSR, only the a->nz values are re-uploaded;
  - full rebuild: otherwise the existing device structure, work vector and row offsets are
    destroyed and a new Mat_SeqAIJCUSPARSEMultStruct is built (CSR, or ELL/HYB on CUDA < 11).

  The offload mask becomes PETSC_OFFLOAD_BOTH unless the host has no value array (a->a is NULL)
  in the full-rebuild path, in which case the mask is left unchanged.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult = cusp->mat;
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ*)A->data;
  PetscInt                     nrows = A->rmap->n,*rowptr,*cprow,ncprow;
  PetscBool                    valuesOnGPU = PETSC_TRUE;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  /* The device copy is already usable: nothing to do */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) {
    PetscFunctionReturn(0);
  }

  if (A->nonzerostate == cusp->nonzerostate && cusp->format == MAT_CUSPARSE_CSR) {
    /* Same nonzero pattern, CSR storage: refresh the values only */
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    if (aij->nz && !aij->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    csr->values->assign(aij->a, aij->a+aij->nz);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu((aij->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    /* Pattern or format changed: throw away the device data and rebuild everything */
    PetscInt nnz;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    cusp->workVector     = NULL;
    cusp->rowoffsets_gpu = NULL;
    try {
      /* Pick the row structure: compressed (only selected rows) or the plain CSR one */
      if (aij->compressedrow.use) {
        nrows  = aij->compressedrow.nrows;
        rowptr = aij->compressedrow.i;
        cprow  = aij->compressedrow.rindex;
      } else {
        nrows  = A->rmap->n;
        rowptr = aij->i;
        cprow  = NULL;
      }
      if (!rowptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
      if (nrows && !aij->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
      if (aij->a) {
        nnz = aij->nz;
      } else {
        /* No host values: take the count from the row offsets; the values cannot be in sync */
        nnz         = rowptr[nrows];
        valuesOnGPU = PETSC_FALSE;
      }

      /* Create the cusparse matrix descriptor */
      cusp->nrows = nrows;
      mult        = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&mult->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(mult->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(mult->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

      /* Device-resident scalar constants; the handle is switched to device pointer mode */
      cerr = cudaMalloc((void **)&(mult->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMalloc((void **)&(mult->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMalloc((void **)&(mult->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(mult->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(mult->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(mult->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

      if (cusp->format==MAT_CUSPARSE_CSR) {
        /* CSR storage: upload row offsets, column indices and (if present) values */
        CsrMatrix *csr = new CsrMatrix;

        csr->num_rows    = nrows;
        csr->num_cols    = A->cmap->n;
        csr->num_entries = nnz;

        csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
        csr->row_offsets->assign(rowptr, rowptr + nrows+1);

        csr->column_indices = new THRUSTINTARRAY32(nnz);
        csr->column_indices->assign(aij->j, aij->j+nnz);

        csr->values = new THRUSTARRAY(nnz);
        if (aij->a) csr->values->assign(aij->a, aij->a+nnz);

        /* Attach the CSR data to the mult structure */
        mult->mat = csr;
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        if (csr->num_rows) { /* cusparse errors on empty matrices! */
          stat = cusparseCreateCsr(&mult->matDescr,
                                   csr->num_rows, csr->num_cols, csr->num_entries,
                                   csr->row_offsets->data().get(), csr->column_indices->data().get(),
                                   csr->values->data().get(),
                                   CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                   CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
        }
       #endif
      } else if (cusp->format==MAT_CUSPARSE_ELL || cusp->format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        /* ELL/HYB storage: stage the data as CSR on the device, convert, then drop the staging copy */
        CsrMatrix *csr = new CsrMatrix;

        csr->num_rows    = nrows;
        csr->num_cols    = A->cmap->n;
        csr->num_entries = nnz;

        csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
        csr->row_offsets->assign(rowptr, rowptr + nrows+1);

        csr->column_indices = new THRUSTINTARRAY32(nnz);
        csr->column_indices->assign(aij->j, aij->j+nnz);

        csr->values = new THRUSTARRAY(nnz);
        if (aij->a) csr->values->assign(aij->a, aij->a+nnz);

        cusparseHybMat_t hyb;
        stat = cusparseCreateHybMat(&hyb);CHKERRCUSPARSE(stat);
        cusparseHybPartition_t partition = cusp->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusp->handle, csr->num_rows, csr->num_cols,
                                mult->descr, csr->values->data().get(),
                                csr->row_offsets->data().get(),
                                csr->column_indices->data().get(),
                                hyb, 0, partition);CHKERRCUSPARSE(stat);
        /* Attach the converted matrix to the mult structure */
        mult->mat = hyb;

        /* The staging CSR copy is no longer needed (deleting a null pointer is a no-op) */
        delete (THRUSTARRAY*)csr->values;
        delete (THRUSTINTARRAY32*)csr->column_indices;
        delete (THRUSTINTARRAY32*)csr->row_offsets;
        delete (CsrMatrix*)csr;
       #endif
      }

      /* Compressed-row bookkeeping: a work vector plus the indices of the stored rows */
      if (aij->compressedrow.use) {
        cusp->workVector   = new THRUSTARRAY(nrows);
        mult->cprowIndices = new THRUSTINTARRAY(nrows);
        mult->cprowIndices->assign(cprow,cprow+nrows);
        ncprow = nrows;
      } else {
        cusp->workVector   = NULL;
        mult->cprowIndices = NULL;
        ncprow = 0;
      }
      ierr = PetscLogCpuToGpu(((nrows+1)+(aij->nz))*sizeof(int)+ncprow*sizeof(PetscInt)+(3+(aij->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Publish the freshly built structure */
      cusp->mat = mult;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    cusp->nonzerostate = A->nonzerostate;
  }
  if (valuesOnGPU) A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
19739ae82921SPaul Mullowney 
/* Functor on a 2-element tuple: adds element 0 to element 1 and stores the sum in element 1 */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple pair)
  {
    /* element<1> <- element<1> + element<0> */
    thrust::get<1>(pair) = thrust::get<1>(pair) + thrust::get<0>(pair);
  }
};
1983aa372e3fSPaul Mullowney 
/* Functor on a 2-element tuple: copies element 0 into element 1 */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple pair)
  {
    /* element<1> <- element<0> */
    thrust::get<1>(pair) = thrust::get<0>(pair);
  }
};
19937e8381f9SStefano Zampini 
/* Functor on a 2-element tuple: copies element 1 into element 0 (opposite of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple pair)
  {
    /* element<0> <- element<1> */
    thrust::get<0>(pair) = thrust::get<1>(pair);
  }
};
2003e6e9a74fSStefano Zampini 
/*
  MatMatCusparse - Per-product data kept between the phases of a matrix-matrix product.

  An instance is released by MatDestroy_MatMatCusparse(), which frees the device buffers,
  destroys the cusparse descriptors and the Mat X, and deletes Bcsr.
*/
struct MatMatCusparse {
  PetscBool             cisdense;      /* NOTE(review): presumably records whether C was dense on entry - confirm against users */
  PetscScalar           *Bt;           /* buffer released with cudaFree(); NOTE(review): presumably an explicit transpose of B - confirm */
  Mat                   X;             /* auxiliary matrix, destroyed together with this structure */
  PetscBool             reusesym;      /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;         /* flop count associated with the product */
  CsrMatrix             *Bcsr;         /* CSR wrapper; only the wrapper itself is deleted on destroy */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;   /* sparse matrix descriptor */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense matrix descriptor */
  cusparseDnMatDescr_t  matCDescr;     /* dense matrix descriptor */
  PetscInt              Blda,Clda;     /* Record leading dimensions of B and C here to detect changes */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;     /* buffer released with cudaFree() */
  void                  *dBuffer5;     /* buffer released with cudaFree() */
 #endif
  size_t                mmBufferSize;  /* NOTE(review): presumably the size in bytes of mmBuffer - confirm */
  void                  *mmBuffer;     /* buffer released with cudaFree() */
  void                  *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;    /* SpGEMM descriptor */
#endif
};
2028ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases everything owned by a MatMatCusparse and the structure itself.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy

  Notes:
  Device buffers are released with cudaFree(), cusparse descriptors with their matching destroy
  routines (CUDA >= 11 only), the Mat X with MatDestroy(), and the structure with PetscFree().
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* Descriptors are destroyed only if they were created */
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4) {
    cerr = cudaFree(mm->dBuffer4);CHKERRCUDA(cerr);
  }
  if (mm->dBuffer5) {
    cerr = cudaFree(mm->dBuffer5);CHKERRCUDA(cerr);
  }
 #endif
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2057ccdfe979SStefano Zampini 
2058ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2059ccdfe979SStefano Zampini 
2060ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2061ccdfe979SStefano Zampini {
2062ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2063ccdfe979SStefano Zampini   Mat                          A,B;
2064afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
2065ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
2066ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2067ccdfe979SStefano Zampini   cusparseStatus_t             stat;
2068ccdfe979SStefano Zampini   cusparseOperation_t          opA;
2069ccdfe979SStefano Zampini   const PetscScalar            *barray;
2070ccdfe979SStefano Zampini   PetscScalar                  *carray;
2071ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2072ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2073ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2074ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2075ccdfe979SStefano Zampini 
2076ccdfe979SStefano Zampini   PetscFunctionBegin;
2077ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2078e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2079ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
2080ccdfe979SStefano Zampini   A    = product->A;
2081ccdfe979SStefano Zampini   B    = product->B;
2082ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2083e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2084ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2085ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
2086ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2087ccdfe979SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2088ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2089ccdfe979SStefano Zampini   switch (product->type) {
2090ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2091ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2092ccdfe979SStefano Zampini     mat = cusp->mat;
2093ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2094ccdfe979SStefano Zampini     m   = A->rmap->n;
2095ccdfe979SStefano Zampini     n   = B->cmap->n;
2096ccdfe979SStefano Zampini     break;
2097ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
20981a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2099e6e9a74fSStefano Zampini       mat = cusp->mat;
2100e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2101e6e9a74fSStefano Zampini     } else {
21023606e59fSJunchao Zhang       ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
2103ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2104ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2105e6e9a74fSStefano Zampini     }
2106ccdfe979SStefano Zampini     m = A->cmap->n;
2107ccdfe979SStefano Zampini     n = B->cmap->n;
2108ccdfe979SStefano Zampini     break;
2109ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2110ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2111ccdfe979SStefano Zampini     mat = cusp->mat;
2112ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2113ccdfe979SStefano Zampini     m   = A->rmap->n;
2114ccdfe979SStefano Zampini     n   = B->rmap->n;
2115ccdfe979SStefano Zampini     break;
2116ccdfe979SStefano Zampini   default:
2117e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2118ccdfe979SStefano Zampini   }
2119e8d2b73aSMark Adams   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2120ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2121ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2122ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2123afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2124ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2125afb2bd1cSJunchao Zhang 
2126ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2127c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2128c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2129c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2130c8378d12SStefano Zampini   } else {
2131c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2132c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2133c8378d12SStefano Zampini   }
2134c8378d12SStefano Zampini 
2135c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2136afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2137afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2138a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2139afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2140fcdce8c4SStefano Zampini     size_t mmBufferSize;
2141afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2142afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2143afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2144afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2145afb2bd1cSJunchao Zhang     }
2146c8378d12SStefano Zampini 
2147afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2148afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2149afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2150afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2151afb2bd1cSJunchao Zhang     }
2152afb2bd1cSJunchao Zhang 
2153afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2154afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2155afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2156afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2157afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2158afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2159afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2160afb2bd1cSJunchao Zhang     }
2161afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2162afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2163afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2164fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2165fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2166ee7b52eaSHong Zhang       cudaError_t cerr;
2167fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2168fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2169fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2170fcdce8c4SStefano Zampini     }
2171afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2172afb2bd1cSJunchao Zhang   } else {
2173afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2174afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2175afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2176afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2177afb2bd1cSJunchao Zhang   }
2178afb2bd1cSJunchao Zhang 
2179afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2180afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2181afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2182afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2183fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2184afb2bd1cSJunchao Zhang  #else
2185afb2bd1cSJunchao Zhang   PetscInt k;
2186afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2187ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2188ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2189ccdfe979SStefano Zampini     cublasStatus_t cerr;
2190ccdfe979SStefano Zampini 
2191ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2192ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2193ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2194ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2195ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2196ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2197ccdfe979SStefano Zampini     blda = B->cmap->n;
2198afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2199afb2bd1cSJunchao Zhang   } else {
2200afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2201ccdfe979SStefano Zampini   }
2202ccdfe979SStefano Zampini 
2203afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2204ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2205afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2206ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2207ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2208ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2209ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2210ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2211afb2bd1cSJunchao Zhang  #endif
2212c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2213c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2214ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2215ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2216ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2217ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2218ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2219ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2220ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2221ccdfe979SStefano Zampini   } else {
2222ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2223ccdfe979SStefano Zampini   }
2224ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2225ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2226ccdfe979SStefano Zampini   }
2227ccdfe979SStefano Zampini   if (!biscuda) {
2228ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2229ccdfe979SStefano Zampini   }
2230ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2231ccdfe979SStefano Zampini }
2232ccdfe979SStefano Zampini 
2233ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2234ccdfe979SStefano Zampini {
2235ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2236ccdfe979SStefano Zampini   Mat                A,B;
2237ccdfe979SStefano Zampini   PetscInt           m,n;
2238ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2239ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2240ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2241ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2242ccdfe979SStefano Zampini 
2243ccdfe979SStefano Zampini   PetscFunctionBegin;
2244ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2245e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2246ccdfe979SStefano Zampini   A    = product->A;
2247ccdfe979SStefano Zampini   B    = product->B;
2248ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2249e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2250ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2251e8d2b73aSMark Adams   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2252ccdfe979SStefano Zampini   switch (product->type) {
2253ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2254ccdfe979SStefano Zampini     m = A->rmap->n;
2255ccdfe979SStefano Zampini     n = B->cmap->n;
2256ccdfe979SStefano Zampini     break;
2257ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2258ccdfe979SStefano Zampini     m = A->cmap->n;
2259ccdfe979SStefano Zampini     n = B->cmap->n;
2260ccdfe979SStefano Zampini     break;
2261ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2262ccdfe979SStefano Zampini     m = A->rmap->n;
2263ccdfe979SStefano Zampini     n = B->rmap->n;
2264ccdfe979SStefano Zampini     break;
2265ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2266ccdfe979SStefano Zampini     m = B->cmap->n;
2267ccdfe979SStefano Zampini     n = B->cmap->n;
2268ccdfe979SStefano Zampini     break;
2269ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2270ccdfe979SStefano Zampini     m = B->rmap->n;
2271ccdfe979SStefano Zampini     n = B->rmap->n;
2272ccdfe979SStefano Zampini     break;
2273ccdfe979SStefano Zampini   default:
2274e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2275ccdfe979SStefano Zampini   }
2276ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2277ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2278ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2279ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2280ccdfe979SStefano Zampini 
2281ccdfe979SStefano Zampini   /* product data */
2282ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2283ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2284afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2285afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2286ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2287afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2288ccdfe979SStefano Zampini   }
2289afb2bd1cSJunchao Zhang  #endif
2290ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2291ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2292ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2293ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2294ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2295ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2296ccdfe979SStefano Zampini     } else {
2297ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2298ccdfe979SStefano Zampini     }
2299ccdfe979SStefano Zampini   }
2300ccdfe979SStefano Zampini   C->product->data    = mmdata;
2301ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2302ccdfe979SStefano Zampini 
2303ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2304ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2305ccdfe979SStefano Zampini }
2306ccdfe979SStefano Zampini 
2307fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2308ccdfe979SStefano Zampini {
2309ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2310fcdce8c4SStefano Zampini   Mat                          A,B;
2311fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2312fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2313fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2314fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2315fcdce8c4SStefano Zampini   PetscBool                    flg;
2316ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2317fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2318fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2319fcdce8c4SStefano Zampini   MatProductType               ptype;
2320fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2321fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2322fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2323fcdce8c4SStefano Zampini #endif
2324b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2325ccdfe979SStefano Zampini 
2326ccdfe979SStefano Zampini   PetscFunctionBegin;
2327ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2328e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2329fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2330e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2331fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2332fcdce8c4SStefano Zampini   A = product->A;
2333fcdce8c4SStefano Zampini   B = product->B;
2334fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2335fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2336fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2337e8d2b73aSMark Adams     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2338fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
2339e8d2b73aSMark Adams     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2340fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
2341e8d2b73aSMark Adams     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2342fcdce8c4SStefano Zampini     goto finalize;
2343fcdce8c4SStefano Zampini   }
2344fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
2345fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2346e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2347fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2348e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2349fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2350fcdce8c4SStefano Zampini   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2351fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2352fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2353fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2354e8d2b73aSMark Adams   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2355e8d2b73aSMark Adams   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2356e8d2b73aSMark Adams   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2357fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2358fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2359fcdce8c4SStefano Zampini 
2360fcdce8c4SStefano Zampini   ptype = product->type;
2361fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2362fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2363fa046f9fSJunchao Zhang     if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2364fa046f9fSJunchao Zhang   }
2365fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2366fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2367fa046f9fSJunchao Zhang     if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2368fa046f9fSJunchao Zhang   }
2369fcdce8c4SStefano Zampini   switch (ptype) {
2370fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2371fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2372fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2373fcdce8c4SStefano Zampini     break;
2374fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2375fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2376fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2377fcdce8c4SStefano Zampini     break;
2378fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2379fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2380fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2381fcdce8c4SStefano Zampini     break;
2382fcdce8c4SStefano Zampini   default:
2383e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2384fcdce8c4SStefano Zampini   }
2385fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
2386e8d2b73aSMark Adams   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2387e8d2b73aSMark Adams   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2388e8d2b73aSMark Adams   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2389fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2390fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2391fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
2392e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2393e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2394e8d2b73aSMark Adams   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2395fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2396fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2397fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2398b4285af6SJunchao Zhang   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2399b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2400b4285af6SJunchao Zhang     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2401b4285af6SJunchao Zhang                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2402b4285af6SJunchao Zhang                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2403b4285af6SJunchao Zhang                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2404b4285af6SJunchao Zhang   #else
2405b4285af6SJunchao Zhang     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2406fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2407fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2408fcdce8c4SStefano Zampini                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2409b4285af6SJunchao Zhang     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2410fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2411fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2412b4285af6SJunchao Zhang   #endif
2413fcdce8c4SStefano Zampini #else
2414b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2415fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2416fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2417fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2418fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2419fcdce8c4SStefano Zampini #endif
2420fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2421fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2422fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2423fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2424fcdce8c4SStefano Zampini finalize:
2425fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
2426fcdce8c4SStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2427fcdce8c4SStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2428fcdce8c4SStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2429fcdce8c4SStefano Zampini   c->reallocs         = 0;
2430fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2431fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2432fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2433fcdce8c4SStefano Zampini   C->num_ass++;
2434ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2435ccdfe979SStefano Zampini }
2436fcdce8c4SStefano Zampini 
2437fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2438fcdce8c4SStefano Zampini {
2439fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2440fcdce8c4SStefano Zampini   Mat                          A,B;
2441fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2442fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2443fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2444fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2445fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2446fcdce8c4SStefano Zampini   PetscBool                    flg;
2447fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2448fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2449fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2450fcdce8c4SStefano Zampini   MatProductType               ptype;
2451fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2452fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2453fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2454fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2455fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2456fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2457fcdce8c4SStefano Zampini #else
2458fcdce8c4SStefano Zampini   int                          cnz;
2459fcdce8c4SStefano Zampini #endif
2460b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2461fcdce8c4SStefano Zampini 
2462fcdce8c4SStefano Zampini   PetscFunctionBegin;
2463fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2464e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2465fcdce8c4SStefano Zampini   A    = product->A;
2466fcdce8c4SStefano Zampini   B    = product->B;
2467fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2468e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2469fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2470e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2471fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2472fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2473fcdce8c4SStefano Zampini   /* product data */
2474fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2475fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2476fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2477fcdce8c4SStefano Zampini 
2478fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2479fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2480d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2481d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2482d60bce21SJunchao Zhang   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2483d60bce21SJunchao Zhang   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2484d60bce21SJunchao Zhang 
2485fcdce8c4SStefano Zampini   ptype = product->type;
2486fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2487fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2488fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2489fa046f9fSJunchao Zhang   }
2490fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2491fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2492fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2493fa046f9fSJunchao Zhang   }
2494fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2495fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2496fcdce8c4SStefano Zampini   switch (ptype) {
2497fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2498fcdce8c4SStefano Zampini     m = A->rmap->n;
2499fcdce8c4SStefano Zampini     n = B->cmap->n;
2500fcdce8c4SStefano Zampini     k = A->cmap->n;
2501fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2502fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2503fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2504fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2505fcdce8c4SStefano Zampini     break;
2506fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2507fcdce8c4SStefano Zampini     m = A->cmap->n;
2508fcdce8c4SStefano Zampini     n = B->cmap->n;
2509fcdce8c4SStefano Zampini     k = A->rmap->n;
25103606e59fSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
2511fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2512fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2513fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2514fcdce8c4SStefano Zampini     break;
2515fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2516fcdce8c4SStefano Zampini     m = A->rmap->n;
2517fcdce8c4SStefano Zampini     n = B->rmap->n;
2518fcdce8c4SStefano Zampini     k = A->cmap->n;
25193606e59fSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
2520fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2521fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2522fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2523fcdce8c4SStefano Zampini     break;
2524fcdce8c4SStefano Zampini   default:
2525e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2526fcdce8c4SStefano Zampini   }
2527fcdce8c4SStefano Zampini 
2528fcdce8c4SStefano Zampini   /* create cusparse matrix */
2529fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2530fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2531fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2532fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2533fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2534fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2535fcdce8c4SStefano Zampini 
2536fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2537fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2538fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2539fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2540fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2541fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2542fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2543fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2544fcdce8c4SStefano Zampini   } else {
2545fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2546fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2547fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2548fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2549fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2550fcdce8c4SStefano Zampini   }
2551fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2552fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2553fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2554fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2555fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2556fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2557fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2558fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2559fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2560fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2561fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2562fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2563fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2564fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2565fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2566fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2567fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2568fcdce8c4SStefano Zampini     c->nz = 0;
2569fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2570fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2571fcdce8c4SStefano Zampini     goto finalizesym;
2572fcdce8c4SStefano Zampini   }
2573fcdce8c4SStefano Zampini 
2574e8d2b73aSMark Adams   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2575e8d2b73aSMark Adams   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2576fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2577fcdce8c4SStefano Zampini   if (!biscompressed) {
2578fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2579fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2580fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2581fcdce8c4SStefano Zampini #endif
2582fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2583fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2584fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2585fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2586fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2587fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2588fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2589fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2590fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2591fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2592fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2593fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2594fcdce8c4SStefano Zampini     }
2595fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2596fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2597fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2598fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2599fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2600fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2601fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2602fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2603fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2604fcdce8c4SStefano Zampini     }
2605fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2606fcdce8c4SStefano Zampini #endif
2607fcdce8c4SStefano Zampini   }
2608e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2609e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2610fcdce8c4SStefano Zampini   /* precompute flops count */
2611fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2612fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2613fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2614fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2615fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2616fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2617fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2618fcdce8c4SStefano Zampini       }
2619fcdce8c4SStefano Zampini     }
2620fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2621fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2622fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2623fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2624fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2625fcdce8c4SStefano Zampini     }
2626fcdce8c4SStefano Zampini   } else { /* TODO */
2627fcdce8c4SStefano Zampini     flops = 0.;
2628fcdce8c4SStefano Zampini   }
2629fcdce8c4SStefano Zampini 
2630fcdce8c4SStefano Zampini   mmdata->flops = flops;
2631fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2632b4285af6SJunchao Zhang 
2633fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2634fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2635fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2636fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
2637fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2638fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2639fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2640b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2641b4285af6SJunchao Zhang  {
2642b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2643b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2644b4285af6SJunchao Zhang   */
2645b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
2646b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
2647b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
2648b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2649b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
2650b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
2651b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
2652b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
2653b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
2654b4285af6SJunchao Zhang 
2655b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2656b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
2657b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2658b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2659b4285af6SJunchao Zhang                                             &bufferSize1, NULL);CHKERRCUSPARSE(stat);
2660b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
2661b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
2662b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2663b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2664b4285af6SJunchao Zhang                                             &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);
2665b4285af6SJunchao Zhang 
2666b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2667b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2668b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2669b4285af6SJunchao Zhang                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
2670b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
2671b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
2672b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
2673b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2674b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2675b4285af6SJunchao Zhang                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
2676b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
2677b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);
2678b4285af6SJunchao Zhang 
2679b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2680b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
2681b4285af6SJunchao Zhang   stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2682b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
2683b4285af6SJunchao Zhang   /* allocate matrix C */
2684b4285af6SJunchao Zhang   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2685b4285af6SJunchao Zhang   Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2686b4285af6SJunchao Zhang   /* update matC with the new pointers */
2687b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2688b4285af6SJunchao Zhang                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2689b4285af6SJunchao Zhang 
2690b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2691b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2692b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2693b4285af6SJunchao Zhang                                   &bufferSize5, NULL);CHKERRCUSPARSE(stat);
2694b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
2695b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2696b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2697b4285af6SJunchao Zhang                                   &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
2698b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
2699b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2700b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2701b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2702b4285af6SJunchao Zhang                                      mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2703b4285af6SJunchao Zhang   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
2704b4285af6SJunchao Zhang  }
2705b4285af6SJunchao Zhang  #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2706b4285af6SJunchao Zhang   size_t bufSize2;
2707fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2708b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2709fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2710fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2711fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2712bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2713fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2714b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2715fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2716fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2717fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2718fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2719b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2720fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2721fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2722fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2723fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2724fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2725fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2726fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2727fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2728bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2729fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2730b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2731fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2732fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2733fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2734fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2735fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2736fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
273700702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2738fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2739fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2740fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2741fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2742fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2743fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2744b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2745fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2746fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2747b4285af6SJunchao Zhang  #endif
2748fcdce8c4SStefano Zampini #else
2749fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2750b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2751fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2752fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2753fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2754fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2755fcdce8c4SStefano Zampini   c->nz = cnz;
2756fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2757fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2758fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2759fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2760fcdce8c4SStefano Zampini 
2761fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2762fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2763fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2764fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2765b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2766fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2767fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2768fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2769fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2770fcdce8c4SStefano Zampini #endif
2771fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2772fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2773fcdce8c4SStefano Zampini finalizesym:
2774fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2775fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2776fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2777fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2778fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2779fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2780fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2781fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2782fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2783fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2784fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2785fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2786fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2787fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2788fcdce8c4SStefano Zampini   } else {
2789fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2790fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2791fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2792fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2793fcdce8c4SStefano Zampini   }
2794fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2795fcdce8c4SStefano Zampini     PetscInt r = 0;
2796fcdce8c4SStefano Zampini     c->i[0] = 0;
2797fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2798fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2799fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2800fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2801fcdce8c4SStefano Zampini     }
2802fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2803fcdce8c4SStefano Zampini   }
2804fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2805fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2806fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2807fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2808fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2809fcdce8c4SStefano Zampini   c->rmax = 0;
2810fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2811fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2812fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2813fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2814fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2815fcdce8c4SStefano Zampini   }
2816fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2817fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2818fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2819fcdce8c4SStefano Zampini 
2820fcdce8c4SStefano Zampini   C->nonzerostate++;
2821fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2822fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2823fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2824fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2825fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2826fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2827fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2828abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2829fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2830fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2831fcdce8c4SStefano Zampini   }
2832fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2833fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2834fcdce8c4SStefano Zampini }
2835fcdce8c4SStefano Zampini 
/* Forward declaration only: MatProductSetFromOptions_SeqAIJ_SeqDense is not defined in this part of
   the file. It takes a single Mat argument and returns a PetscErrorCode.
   NOTE(review): presumably this is the host SeqAIJ-times-SeqDense product setup, declared here so the
   product dispatch that follows can hand off to it -- confirm against its definition and callers. */
2836fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2837fcdce8c4SStefano Zampini 
2838fcdce8c4SStefano Zampini /* handles sparse or dense B */
2839fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2840fcdce8c4SStefano Zampini {
2841fcdce8c4SStefano Zampini   Mat_Product    *product = mat->product;
2842fcdce8c4SStefano Zampini   PetscErrorCode ierr;
2843fcdce8c4SStefano Zampini   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2844fcdce8c4SStefano Zampini 
2845fcdce8c4SStefano Zampini   PetscFunctionBegin;
2846fcdce8c4SStefano Zampini   MatCheckProduct(mat,1);
2847fcdce8c4SStefano Zampini   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2848abb89eb1SStefano Zampini   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2849fcdce8c4SStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2850fcdce8c4SStefano Zampini   }
2851fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
2852fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
2853fcdce8c4SStefano Zampini     if (!product->C->boundtocpu) {
2854fcdce8c4SStefano Zampini       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2855fcdce8c4SStefano Zampini     }
2856fcdce8c4SStefano Zampini   }
285765e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
285865e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
285965e4b4d4SStefano Zampini     switch (product->type) {
286065e4b4d4SStefano Zampini     case MATPRODUCT_AB:
286165e4b4d4SStefano Zampini       if (product->api_user) {
286265e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
286365e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
286465e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
286565e4b4d4SStefano Zampini       } else {
286665e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
286765e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
286865e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
286965e4b4d4SStefano Zampini       }
287065e4b4d4SStefano Zampini       break;
287165e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
287265e4b4d4SStefano Zampini       if (product->api_user) {
287365e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
287465e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
287565e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
287665e4b4d4SStefano Zampini       } else {
287765e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
287865e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
287965e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
288065e4b4d4SStefano Zampini       }
288165e4b4d4SStefano Zampini       break;
288265e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
288365e4b4d4SStefano Zampini       if (product->api_user) {
288465e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
288565e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
288665e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
288765e4b4d4SStefano Zampini       } else {
288865e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
288965e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
289065e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
289165e4b4d4SStefano Zampini       }
289265e4b4d4SStefano Zampini       break;
289365e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
289465e4b4d4SStefano Zampini       if (product->api_user) {
289565e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
289665e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
289765e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
289865e4b4d4SStefano Zampini       } else {
289965e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
290065e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
290165e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
290265e4b4d4SStefano Zampini       }
290365e4b4d4SStefano Zampini       break;
290465e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
290565e4b4d4SStefano Zampini       if (product->api_user) {
290665e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
290765e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
290865e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
290965e4b4d4SStefano Zampini       } else {
291065e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
291165e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
291265e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
291365e4b4d4SStefano Zampini       }
291465e4b4d4SStefano Zampini       break;
291565e4b4d4SStefano Zampini     default:
291665e4b4d4SStefano Zampini       break;
291765e4b4d4SStefano Zampini     }
291865e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
291965e4b4d4SStefano Zampini   }
292065e4b4d4SStefano Zampini   /* dispatch */
2921fcdce8c4SStefano Zampini   if (isdense) {
2922ccdfe979SStefano Zampini     switch (product->type) {
2923ccdfe979SStefano Zampini     case MATPRODUCT_AB:
2924ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
2925ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
2926ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
2927ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
2928fcdce8c4SStefano Zampini      if (product->A->boundtocpu) {
2929fcdce8c4SStefano Zampini         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2930fcdce8c4SStefano Zampini       } else {
2931fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2932fcdce8c4SStefano Zampini       }
2933fcdce8c4SStefano Zampini       break;
2934fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2935fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2936fcdce8c4SStefano Zampini       break;
2937ccdfe979SStefano Zampini     default:
2938ccdfe979SStefano Zampini       break;
2939ccdfe979SStefano Zampini     }
2940fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
2941fcdce8c4SStefano Zampini     switch (product->type) {
2942fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
2943fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
2944fcdce8c4SStefano Zampini     case MATPRODUCT_ABt:
2945fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2946fcdce8c4SStefano Zampini       break;
2947fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
2948fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
2949fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2950fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2951fcdce8c4SStefano Zampini       break;
2952fcdce8c4SStefano Zampini     default:
2953fcdce8c4SStefano Zampini       break;
2954fcdce8c4SStefano Zampini     }
2955fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
2956fcdce8c4SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2957fcdce8c4SStefano Zampini   }
2958ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2959ccdfe979SStefano Zampini }
2960ccdfe979SStefano Zampini 
/* yy = A*xx: forwards to the shared kernel with no vector to add and no transposition */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2969e6e9a74fSStefano Zampini 
/* zz = A*xx + yy: forwards to the shared kernel without transposition */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2978e6e9a74fSStefano Zampini 
/* yy = A^H*xx: forwards to the shared kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2987e6e9a74fSStefano Zampini 
/* zz = A^H*xx + yy: forwards to the shared kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
29969ae82921SPaul Mullowney 
/* yy = A^T*xx: forwards to the shared kernel with the transpose flag only */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3005ca45077fSPaul Mullowney 
/*
  ScatterAdd - CUDA kernel computing y[idx[i]] += x[i] for 0 <= i < n.

  Each thread handles at most one entry i; threads whose index is >= n do nothing, so the launch
  must use a 1D grid of 1D blocks providing at least n threads in total.
  The update is a plain (non-atomic) read-modify-write, so the result is only well defined when
  the entries idx[0..n-1] are pairwise distinct.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* Form the global index in PetscInt arithmetic: with 64-bit PetscInt, n may exceed what the
     32-bit product blockIdx.x*blockDim.x (or an int) can represent */
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3011a0e72f99SJunchao Zhang 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - computes z = op(A) x + y on the GPU.
  If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

  Input Parameters:
    A     - the matrix
    xx    - the vector x
    yy    - the vector y to add, or NULL to compute z = op(A) x
    trans - PETSC_TRUE to apply the transpose (or, together with herm, the Hermitian transpose)
    herm  - PETSC_TRUE to conjugate; only valid together with trans, otherwise an error is raised

  Output Parameter:
    zz    - the vector z; may be the same object as yy

  Notes:
    When trans is set without herm and A->form_explicit_transpose is true, an explicitly stored
    transpose is multiplied with CUSPARSE_OPERATION_NON_TRANSPOSE; otherwise the operation is passed
    to cuSPARSE. All branches below are therefore keyed on opA, not on trans.

    When the matrix uses compressed rows (matstruct->cprowIndices is set), cusparsestruct->workVector
    holds the short vector: the product result for the non-transposed operation, the gathered input
    for the transposed one.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* nonzerorowcnt is zero: nothing is multiplied, z is y (or zero when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Checked for every selection above, since matstruct is dereferenced right below */
  if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         A^T x is of full length, so when z and y are the same vector we accumulate in place with beta = 1.
         When they differ, zarray was obtained write-only and holds no defined values: beta must be 0
         and y is added after the product. (Using beta = 1 whenever y exists would add stale data to z.)
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
        /* A kernel launch returns no status; query it so a bad launch configuration is not silently ignored */
        cerr = cudaGetLastError();CHKERRCUDA(cerr);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* Flop count logged for the product; without the add it is reduced by nonzerorowcnt */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
32039ae82921SPaul Mullowney 
/* zz = A^T*xx + yy: forwards to the shared kernel with the transpose flag only */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3212ca45077fSPaul Mullowney 
/*
  Completes assembly through MatAssemblyEnd_SeqAIJ(). If A->nonzerostate differs from its value
  on entry and a deviceMat is stored in the CUSPARSE struct, that device allocation is freed
  and the pointer cleared.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE     *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscObjectState oldstate = A->nonzerostate; /* sampled before the host assembly runs */
  PetscErrorCode         ierr;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* nothing to release when the state is unchanged or no device matrix exists */
  if (oldstate == A->nonzerostate || !cusp->deviceMat) PetscFunctionReturn(0);

  ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
  cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
  cusp->deviceMat = NULL;
  PetscFunctionReturn(0);
}
32309ae82921SPaul Mullowney 
32319ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  Mat            mat;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  mat  = *A; /* same object the caller receives; local and global sizes are both (m,n) */
  ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(mat,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
32909ae82921SPaul Mullowney 
/*
  MatDestroy_SeqAIJCUSPARSE - Destroy routine for SeqAIJCUSPARSE matrices.

  Releases the structure stored in A->spptr (the multiplication data for an
  unfactored matrix, the triangular factor data otherwise), removes the
  functions composed with the object and finally calls MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions that are reset to NULL, in this order */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };
  const int      ncomposed = (int)(sizeof(composed)/sizeof(composed[0]));
  PetscErrorCode ierr;
  int            i;

  PetscFunctionBegin;
  /* A->spptr holds a different structure depending on whether A is a factor */
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  for (i=0; i<ncomposed; i++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[i],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33139ae82921SPaul Mullowney 
3314ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
331595639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicates A into *B.

  The duplicate is first produced by MatDuplicate_SeqAIJ() and then converted
  in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* step 1: duplicate through the SeqAIJ implementation */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* step 2: convert the duplicate in place */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33259ff858a8SKarl Rupp 
/*
  MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y.

  Input Parameters:
+ Y   - the matrix that is updated
. a   - the scalar multiplying X
. X   - the matrix that is added
- str - relation between the nonzero patterns of X and Y

  Notes:
  When X and Y do not share the same axpy implementation the work is handed to
  MatAXPY_SeqAIJ(). Otherwise both matrices are copied to the GPU (only
  MAT_CUSPARSE_CSR is supported) and
  - SAME_NONZERO_PATTERN is handled by a cublas axpy on the value arrays;
  - SUBSET_NONZERO_PATTERN is handled by cusparse spgeam, writing into Y;
  - anything else is handed to MatAXPY_SeqAIJ().
  If str is not SAME_NONZERO_PATTERN but the device row offsets and column
  indices of the two matrices are found to be equal, SAME_NONZERO_PATTERN is
  used instead.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *xaij = (Mat_SeqAIJ*)X->data,*yaij = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *yspptr = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  Mat_SeqAIJCUSPARSE *xspptr = (Mat_SeqAIJCUSPARSE*)X->spptr;
  PetscScalar        *yvals;
  const PetscScalar  *xvals;
  CsrMatrix          *ycsr,*xcsr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    /* different axpy implementations: use the SeqAIJ code */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (yspptr->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (xspptr->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix*)yspptr->mat->mat;
  xcsr = (CsrMatrix*)xspptr->mat->mat;

  /* see if we can turn this into a cublas axpy: compare the patterns on the device */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    const bool samerows = thrust::equal(thrust::device,ycsr->row_offsets->begin(),ycsr->row_offsets->end(),xcsr->row_offsets->begin());

    /* column indices are only compared when the row offsets already agree */
    if (samerows && thrust::equal(thrust::device,ycsr->column_indices->begin(),ycsr->column_indices->end(),xcsr->column_indices->begin())) {
      str = SAME_NONZERO_PATTERN;
    }
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str != SAME_NONZERO_PATTERN && str != SUBSET_NONZERO_PATTERN) {
    /* no GPU path for this pattern relation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* both GPU paths work on the value arrays of X (read only) and Y */
  ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
  if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t     cublasv2handle;
    cublasStatus_t     berr;
    PetscBLASInt       bnz = 1;
    const PetscBLASInt one = 1;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(xaij->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,xvals,one,yvals,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  } else { /* SUBSET_NONZERO_PATTERN */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    /* a and b are host variables: use host pointer mode for the duration of the call */
    stat = cusparseSetPointerMode(yspptr->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* with CUDA 11 and later spgeam needs a work buffer */
    stat = cusparse_csr_spgeam_bufferSize(yspptr->handle,Y->rmap->n,Y->cmap->n,
                                          &a,xspptr->mat->descr,xaij->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                                          &b,yspptr->mat->descr,yaij->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                             yspptr->mat->descr,         yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
#endif
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* the output matrix is Y itself (same descriptor, values, row offsets and column indices) */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam(yspptr->handle,Y->rmap->n,Y->cmap->n,
                               &a,xspptr->mat->descr,xaij->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                               &b,yspptr->mat->descr,yaij->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                  yspptr->mat->descr,         yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
#else
    stat = cusparse_csr_spgeam(yspptr->handle,Y->rmap->n,Y->cmap->n,
                               &a,xspptr->mat->descr,xaij->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                               &b,yspptr->mat->descr,yaij->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                  yspptr->mat->descr,         yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    ierr = PetscLogGpuFlops(xaij->nz + yaij->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#endif
    stat = cusparseSetPointerMode(yspptr->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
342395639643SRichard Tran Mills 
/*
  MatScale_SeqAIJCUSPARSE - Scales the stored values of Y by a.

  A single cublas scal call is applied to the value array obtained from
  MatSeqAIJCUSPARSEGetArray(), over the nz entries recorded in the SeqAIJ data.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ         *yaij = (Mat_SeqAIJ*)Y->data;
  PetscScalar        *vals;
  cublasHandle_t     handle;
  cublasStatus_t     cberr;
  PetscBLASInt       nvals = 1;
  const PetscBLASInt stride = 1;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  /* cublas takes a BLAS integer: the cast errors out if nz does not fit */
  ierr = PetscBLASIntCast(yaij->nz,&nvals);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,nvals,&a,vals,stride);CHKERRCUBLAS(cberr);
  ierr = PetscLogGpuFlops(nvals);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  /* the values changed, so any cached diagonal information is stale */
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
344533c9ba73SStefano Zampini 
/*
  MatZeroEntries_SeqAIJCUSPARSE - Zeroes the stored values of A.

  For an unfactored matrix the device value arrays of the matrix and of its
  transpose are zeroed when they exist. The host value array is always zeroed.
  The offload mask is set to PETSC_OFFLOAD_BOTH when the device values of the
  matrix itself were zeroed, and to PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuzeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    Mat_SeqAIJCUSPARSEMultStruct *structs[2] = {cusp->mat,cusp->matTranspose};
    int                          k;

    /* k = 0: the matrix, k = 1: its transpose */
    for (k=0; k<2; k++) {
      CsrMatrix *csr;

      if (!structs[k]) continue;
      csr = (CsrMatrix*)structs[k]->mat;
      if (!csr->values) continue;
      /* only the matrix itself decides whether host and device are both valid */
      if (k == 0) gpuzeroed = PETSC_TRUE;
      thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
    }
  }
  /* host side: zero the a->i[A->rmap->n] leading entries of the value array */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = gpuzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
34763fa6b06aSMark Adams 
/*
  MatBindToCPU_SeqAIJCUSPARSE - Switches A between its CPU (SeqAIJ) and GPU
  (SeqAIJCUSPARSE) operation sets.

  Input Parameters:
+ A   - the matrix
- flg - PETSC_TRUE to install the SeqAIJ operations, PETSC_FALSE to install
        the SeqAIJCUSPARSE ones

  Notes:
  Nothing is done for factored matrices.
  When binding to the CPU the matrix is first copied back from the GPU.
  Inodes are used only when the matrix is bound to the CPU and inode
  information is available.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (!flg) {
    /* GPU: install the CUSPARSE implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;

    /* array access hooks of the SeqAIJ data structure */
    aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;

    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* CPU: bring the data back from the GPU before installing the SeqAIJ implementations */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;

    /* clear every array access hook of the SeqAIJ data structure */
    ierr = PetscMemzero(aij->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);

    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes are enabled only on the CPU, and only when inode sizes are available */
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}
3537a587d139SMark 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Converts a SeqAIJ matrix to MATSEQAIJCUSPARSE.

  Input Parameters:
+ A     - the matrix to convert
. mtype - requested type (not referenced in the body)
- reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies
          A into the existing *newmat, any other value works on *newmat as given

  Output Parameter:
. newmat - the converted matrix

  Notes:
  The default vector type becomes VECCUDA. Unless reuse is MAT_REUSE_MATRIX, a
  missing spptr structure is allocated together with its cusparse handle, and
  the offload mask is set to PETSC_OFFLOAD_UNALLOCATED.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t cstat;
  Mat              C;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* nothing to prepare */
    break;
  }
  C = *newmat;

  /* vectors created from the matrix default to VECCUDA */
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !C->spptr) {
    /* spptr holds a different structure for factored and unfactored matrices */
    if (C->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      ierr  = PetscNew(&factors);CHKERRQ(ierr);
      cstat = cusparseCreate(&factors->handle);CHKERRCUSPARSE(cstat);
      cstat = cusparseSetStream(factors->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(cstat);
      C->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      ierr  = PetscNew(&cusp);CHKERRQ(ierr);
      cstat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(cstat);
      cstat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(cstat);
      cusp->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      cusp->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      C->spptr = cusp;
    }
    C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that do not depend on the CPU/GPU binding */
  C->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  C->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU implementations of the remaining operations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(C,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)C,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)C,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
35989ae82921SPaul Mullowney 
/*
  MatCreate_SeqAIJCUSPARSE - Constructor for MATSEQAIJCUSPARSE: sets B up as a
  SeqAIJ matrix and then converts it in place.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* start from a plain SeqAIJ matrix */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* in-place conversion: B is both the input and the output */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
360802fe1965SBarry Smith 
36093ca39a21SBarry Smith /*MC
3610e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3611e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are not supported since CUDA 11.0.
36142692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3615e057df02SPaul Mullowney 
3616e057df02SPaul Mullowney    Options Database Keys:
3617e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3618aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3619a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3620e057df02SPaul Mullowney 
3621e057df02SPaul Mullowney   Level: beginner
3622e057df02SPaul Mullowney 
36238468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3624e057df02SPaul Mullowney M*/
36257f756511SDominic Meiser 
3626bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
36270f39cd5aSBarry Smith 
/*
  MatSolverTypeRegister_CUSPARSE - Registers the factorizations provided in this file:
  MATSOLVERCUSPARSEBAND (LU) for MATSEQAIJ, and MATSOLVERCUSPARSE (LU, Cholesky,
  ILU and ICC) for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* factorizations registered for MATSOLVERCUSPARSE, in this order */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const int           nftypes  = (int)(sizeof(ftypes)/sizeof(ftypes[0]));
  PetscErrorCode      ierr;
  int                 i;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (i=0; i<nftypes; i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
364129b38603SBarry Smith 
/*
  MatSeqAIJCUSPARSE_Destroy - Releases a Mat_SeqAIJCUSPARSE structure and
  everything it owns; does nothing when *cusparsestruct is NULL.

  The structure is released with PetscFree(), applied to *cusparsestruct.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode     ierr;
  cusparseStatus_t   cstat;
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  /* device data of the matrix and of its transpose */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* auxiliary members released with delete */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;

  if (cusp->handle) {
    cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
  }
  /* free through the caller's pointer, as PetscFree() is given that lvalue */
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
36617f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - Deletes a CsrMatrix together with its values,
  column_indices and row_offsets members, then sets *mat to NULL.
  Does nothing when *mat is already NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
36747f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload for triangular factor structures.

  Releases the matrix descriptor, the solve analysis info, the CSR matrix and
  the solve, host (AA_h) and csr2csc buffers of *trifactor when they are set,
  then frees the structure itself. Does nothing when *trifactor is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscErrorCode                    ierr;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    cstat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(cstat);
  }
  if (tf->solveInfo) {
    cstat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(cstat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cuerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cuerr);
  }
  /* AA_h is released with cudaFreeHost(), unlike the other buffers */
  if (tf->AA_h) {
    cuerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cuerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cuerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cuerr);
  }
 #endif
  /* free through the caller's pointer, as PetscFree() is given that lvalue */
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
36947f756511SDominic Meiser 
/*
  Destroys a mat-mult structure and everything it owns, then sets *matstruct to NULL.

  Input Parameters:
+ matstruct - address of the structure pointer; a NULL *matstruct is a no-op
- format    - storage format used to interpret (*matstruct)->mat: ELL/HYB hold a
              cusparseHybMat_t (only available before CUDA 11), anything else a CsrMatrix

  Releases the stored matrix, the cuSPARSE matrix descriptor, the compressed-row index
  array, the alpha_one/beta_zero/beta_one scalars (cudaFree) and, for CUDA >= 11, the
  sparse-matrix descriptor plus the buffer and dense-vector descriptors of every
  initialized cuSpMV slot.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate failures instead of silently dropping the return code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cuSpMV slot per entry of the 3-element cuSpMV[] array; only initialized slots own resources */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
37387f756511SDominic Meiser 
/*
  Resets a triangular-factors container so it can be reused: destroys the four
  triangular-factor structures (lower/upper and their transposes), deletes the
  permutation index arrays and the work vector, frees the band arrays a_band_d and
  i_band_d, and clears init_dev_prop. The container itself and its cuSPARSE handle
  are left untouched. A NULL *trifactors is a no-op.

  Every released member is set back to NULL so that calling this routine again
  (MatSeqAIJCUSPARSETriFactors_Destroy() calls it too) cannot free anything twice.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {
      cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);
      (*trifactors)->a_band_d = NULL; /* avoid a double cudaFree on the next reset */
    }
    if ((*trifactors)->i_band_d) {
      cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);
      (*trifactors)->i_band_d = NULL; /* avoid a double cudaFree on the next reset */
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3761ccdfe979SStefano Zampini 
/*
  Destroys a triangular-factors container: resets its contents with
  MatSeqAIJCUSPARSETriFactors_Reset(), destroys its cuSPARSE handle if one is set,
  and releases the container with PetscFree(). A NULL *trifactors is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);

  /* the handle is owned by the container and is not touched by the reset above */
  handle = (*trifactors)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
  }

  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
37787e8381f9SStefano Zampini 
/* Strict lexicographic "less than" on (i,j) pairs: order by the first component, break ties with the second */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    if (i1 != i2) return i1 < i2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
37897e8381f9SStefano Zampini 
/* Equality on (i,j) pairs: true only when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return thrust::get<0>(t1) == thrust::get<0>(t2) && thrust::get<1>(t1) == thrust::get<1>(t2);
  }
};
37997e8381f9SStefano Zampini 
/* Change indicator: yields 1 when the two values differ, 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
38087e8381f9SStefano Zampini 
/* Logical OR of two flags expressed as integers: yields 1 if either argument is nonzero, 0 otherwise */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    const bool any = (t1 != 0) || (t2 != 0);

    return any ? 1 : 0;
  }
};
38177e8381f9SStefano Zampini 
38187e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  Sets the values of a matrix whose nonzero pattern was defined through
  MatSetPreallocationCOO_SeqAIJCUSPARSE(), working directly on the device CSR values.

  Input Parameters:
+ A     - the matrix; its CUSPARSE struct and device CSR values must already exist
. v     - the COO values, in the same order as the (i,j) pairs given at preallocation;
          may live on the host or on the device (checked with isCudaMem()), or be NULL
- imode - ADD_VALUES adds to the current values, anything else overwrites them

  Notes:
  cooPerm maps each sorted COO entry to its position in v[]; cooPerm_a, when present,
  maps each sorted COO entry to the unique nonzero it contributes to, i.e. the COO input
  had repeated (i,j) pairs whose values must be summed.

  With v == NULL the values are zeroed for INSERT_VALUES and left untouched otherwise.
  If no COO permutation is available the routine only assembles the matrix.

  Temporaries are scoped objects, so they are released on every exit path, including
  the early returns taken by CHKERRQ().
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
  } else {
    const PetscInt                        n = cusp->cooPerm->size();
    const bool                            v_on_device = isCudaMem(v);
    THRUSTARRAY                           cooPerm_v(v_on_device ? 0 : n); /* device staging copy, only needed for host input */
    thrust::device_ptr<const PetscScalar> d_v;

    if (v_on_device) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      cooPerm_v.assign(v,v+n);
      d_v = cooPerm_v.data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
      if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them */
        THRUSTARRAY cooPerm_w(matrix->values->size());
        auto        vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
          cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
          cooPerm_a is ordered. d_v[cooPerm[i]] contributes to the cooPerm_a[i]-th unique nonzero.
        */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        /* values[] += reduced contributions */
        thrust::transform(cooPerm_w.begin(),cooPerm_w.end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      } else {
        /* all nonzeros in d_v[] are unique entries */
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
      }
    } else {
      if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
        auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      } else {
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }
  /* the up-to-date values now live on the device */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
39007e8381f9SStefano Zampini 
/*
  Marks the cached transpose of A as out of date.

  Input Parameters:
+ A       - a MATSEQAIJCUSPARSE matrix
- destroy - if true, also destroy the transpose mult structure and the csr2csc index array

  Does nothing when A has no CUSPARSE struct attached.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    /* forces the transpose to be refreshed before its next use */
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3917a49f1ed0SStefano Zampini 
39187e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  Builds the CSR nonzero pattern of A from n COO (i,j) pairs and records the permutations
  needed later by MatSetValuesCOO_SeqAIJCUSPARSE().

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices of the entries (read as host memory, length n)
- coo_j - column indices of the entries (read as host memory, length n)

  Notes:
  The pairs are sorted by row, then by column, on the device; repeated pairs are merged.
  cusp->cooPerm holds the sorting permutation. cusp->cooPerm_a is kept only when repeated
  pairs exist and then maps each sorted entry to the index of its unique nonzero.

  The host CSR arrays of A are rebuilt from the unique pairs, the values are zeroed and
  copied to the GPU, any cached transpose is destroyed, and A is left unassembled.
  With n == 0 the matrix is preallocated empty.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* same guard as MatSetValuesCOO_SeqAIJCUSPARSE(): cusp is dereferenced right below */
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* the permutation arrays can be reused only if the number of COO entries did not change */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      After unique (x marks leftover storage past the new logical end):
      d_i     = [1,3,3,4,4,x]
      d_j     = [2,2,3,5,6,x]
                           ^nekey points here (index 5); ekey is one past x (index 6)
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the unique, sorted (i,j) pairs */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths, number of nonempty rows and maximum row length */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* values still have to be provided through MatSetValuesCOO() */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4036ed502f03SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers (pass NULL if not needed)
-   j - the CSR column indices (pass NULL if not needed)

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      If both i and j are NULL the routine returns immediately. Otherwise the matrix is first
      copied to the GPU with MatSeqAIJCUSPARSECopyToGPU(). Not implemented for the ELL and HYB formats.

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* nothing to do only when neither output is requested; each one is optional on its own */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the device CSR is stored in compressed-row form: build (once) and cache the full row offsets from the host a->i */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
40855f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers, set to NULL on return (may itself be NULL)
-   j - the CSR column indices, set to NULL on return (may itself be NULL)

    Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* invalidate whichever pointers the caller handed back */
  if (i) {
    *i = NULL;
  }
  if (j) {
    *j = NULL;
  }
  PetscFunctionReturn(0);
}
41125f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host.
   Not implemented for the ELL and HYB storage formats.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  {
    /* only CSR storage exposes a plain values array */
    const bool unsupported = (cusp->format == MAT_CUSPARSE_ELL) || (cusp->format == MAT_CUSPARSE_HYB);

    if (unsupported) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  }
  /* make sure the device copy is current before handing it out */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4148ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access changes no matrix state: just invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(0);
}
4173ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host (MatSeqAIJCUSPARSECopyToGPU() is called).
   On success the offload mask of A is set to PETSC_OFFLOAD_GPU and MatSeqAIJCUSPARSEInvalidateTranspose() is called on A.
   Errors with PETSC_ERR_SUP for the ELL and HYB storage formats, and with PETSC_ERR_COR if the GPU data structures are missing.

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only touch A->spptr once A has been validated, so that an invalid A is reported instead of dereferenced */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing it out for read-write access */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may modify the values on the device: flag the GPU copy as the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - releases the read-write device array previously handed out by MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; it is reset to NULL on return

   Level: developer

   Notes: calls PetscObjectStateIncrease() on A before clearing the pointer

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* argument validation: A must be a Mat of type MATSEQAIJCUSPARSE, a must be a valid pointer */
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);

  /* the values may have been changed through the array: record a new object state */
  ierr = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(ierr);

  /* drop the caller's handle */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4238039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU (the offload mask of A is set to PETSC_OFFLOAD_GPU).
   MatSeqAIJCUSPARSEInvalidateTranspose() is called on A.
   Errors with PETSC_ERR_SUP for the ELL and HYB storage formats, and with PETSC_ERR_COR if the GPU data structures are missing.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only touch A->spptr once A has been validated, so that an invalid A is reported instead of dereferenced */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU() here, the device structures must already exist */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller will overwrite the values on the device: flag the GPU copy as the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4275ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - releases the write-only device array previously handed out by MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; it is reset to NULL on return

   Level: developer

   Notes: calls PetscObjectStateIncrease() on A before clearing the pointer

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* argument validation: A must be a Mat of type MATSEQAIJCUSPARSE, a must be a valid pointer */
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);

  /* the values have been (re)written through the array: record a new object state */
  ierr = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(ierr);

  /* drop the caller's handle */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4303ed502f03SStefano Zampini 
/*
   Comparison functor for 4-component tuples: orders lexicographically by the first
   component and then by the second one; the third (PetscScalar) and fourth (int)
   components do not take part in the comparison.

   Returns true when t1 is strictly before t2 in that order.

   The free function thrust::get<N>() is used instead of the tuple member get<N>() since the
   latter is not available in every Thrust version. The call operator is const so that the
   functor can also be invoked through a const object.
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
4314ed502f03SStefano Zampini 
/*
   Unary functor adding a fixed integer offset to its argument: Shift(s)(c) returns c + s.

   The call operator is const (it does not modify the stored offset) so that the functor
   can be invoked through a const object or reference.
*/
struct Shift
{
  int _shift; /* offset added to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
4326ed502f03SStefano Zampini 
4327ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4328ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4329ed502f03SStefano Zampini {
4330ed502f03SStefano Zampini   PetscErrorCode               ierr;
4331ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4332ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4333ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4334ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4335ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4336ed502f03SStefano Zampini   cusparseStatus_t             stat;
4337ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4338ed502f03SStefano Zampini   cudaError_t                  cerr;
4339ed502f03SStefano Zampini 
4340ed502f03SStefano Zampini   PetscFunctionBegin;
4341ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4342ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4343ed502f03SStefano Zampini   PetscValidPointer(C,4);
4344ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4345ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
4346ed502f03SStefano Zampini   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
4347ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4348ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4349ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4350ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4351ed502f03SStefano Zampini     m     = A->rmap->n;
4352ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
4353ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
4354ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
4355ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4356ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4357ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4358ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4359ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4360ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4361ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4362ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4363ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4364ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4365ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4366ed502f03SStefano Zampini     Ccusp->nrows    = m;
4367ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4368ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4369ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4370ed502f03SStefano Zampini     Ccsr->num_cols  = n;
4371ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4372ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4373ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4374ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4375ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4376ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4377ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4378ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4379ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4380ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4381ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4382ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4383ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4384ed502f03SStefano Zampini 
4385ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4386ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4387ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4388ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4389ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4390ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4391ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4392ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4393ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4394ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4395ed502f03SStefano Zampini     if (c->nz) {
43962ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
43972ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
43982ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
43992ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
44002ed87e7eSStefano Zampini 
4401ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4402ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4403ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4404ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4405ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4406ed502f03SStefano Zampini         }
44072ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
44082ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4409ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4410ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4411ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4412ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4413ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4414ed502f03SStefano Zampini         }
44152ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
44162ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
4417ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
44182ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
44192ed87e7eSStefano Zampini                               Aroff->data().get(),
44202ed87e7eSStefano Zampini                               Annz,
44212ed87e7eSStefano Zampini                               m,
44222ed87e7eSStefano Zampini                               Acoo->data().get(),
44232ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4424ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
44252ed87e7eSStefano Zampini                               Broff->data().get(),
4426ed502f03SStefano Zampini                               Bnnz,
4427ed502f03SStefano Zampini                               m,
44282ed87e7eSStefano Zampini                               Bcoo->data().get(),
4429ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
44302ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
44312ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
44322ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
44338909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4434ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4435ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
44368909a122SStefano Zampini #else
44378909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
44388909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
44398909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
44408909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
44418909a122SStefano Zampini #endif
44422ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
44432ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
44442ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
44452ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
44462ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
44472ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4448ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4449ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4450ed502f03SStefano Zampini       thrust::advance(p2,Annz);
44512ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
44528909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
44538909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
44548909a122SStefano Zampini #endif
44552ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
44562ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
44572ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
44582ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
44592ed87e7eSStefano Zampini #else
44602ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
44612ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
44622ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
44632ed87e7eSStefano Zampini #endif
4464ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
44652ed87e7eSStefano Zampini                               Ccoo->data().get(),
4466ed502f03SStefano Zampini                               c->nz,
4467ed502f03SStefano Zampini                               m,
4468ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4469ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4470ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
44712ed87e7eSStefano Zampini       delete wPerm;
44722ed87e7eSStefano Zampini       delete Acoo;
44732ed87e7eSStefano Zampini       delete Bcoo;
44742ed87e7eSStefano Zampini       delete Ccoo;
4475ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4476ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4477ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4478ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4479ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4480ed502f03SStefano Zampini #endif
44811a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
44823606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
44833606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
4484ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4485ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4486ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4487ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4488ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4489ed502f03SStefano Zampini 
44901a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
44911a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4492a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4493ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4494ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4495ed502f03SStefano Zampini         CcsrT->num_rows = n;
4496ed502f03SStefano Zampini         CcsrT->num_cols = m;
4497ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4498ed502f03SStefano Zampini 
4499ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4500ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4501ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4502ed502f03SStefano Zampini 
4503ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4504ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4505ed502f03SStefano Zampini         if (AT) {
4506ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4507ed502f03SStefano Zampini           thrust::advance(rT,-1);
4508ed502f03SStefano Zampini         }
4509ed502f03SStefano Zampini         if (BT) {
4510ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4511ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4512ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4513ed502f03SStefano Zampini         }
4514ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4515ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4516ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4517ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4518ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4519ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4520ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4521ed502f03SStefano Zampini 
4522ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4523ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4524ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4525ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4526ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4527ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4528ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4529ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4530ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4531ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4532ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4533ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4534ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4535ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4536ed502f03SStefano Zampini #endif
4537ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4538ed502f03SStefano Zampini       }
4539ed502f03SStefano Zampini     }
4540ed502f03SStefano Zampini 
4541ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4542ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4543ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4544ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4545ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4546ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4547ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4548ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4549ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4550ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4551ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4552ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4553ed502f03SStefano Zampini     } else {
4554ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4555ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4556ed502f03SStefano Zampini     }
4557ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4558ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4559ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4560ed502f03SStefano Zampini     c->maxnz = c->nz;
4561ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4562ed502f03SStefano Zampini     c->rmax = 0;
4563ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4564ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4565ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4566ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4567ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4568ed502f03SStefano Zampini     }
4569ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4570ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4571ed502f03SStefano Zampini     (*C)->nonzerostate++;
4572ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4573ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4574ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4575ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4576ed502f03SStefano Zampini   } else {
4577ed502f03SStefano Zampini     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4578ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4579ed502f03SStefano Zampini     if (c->nz) {
4580ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4581ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4582ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4583ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4584ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4585ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4586ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4587ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4588ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4589ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4590ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4591ed502f03SStefano Zampini       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4592ed502f03SStefano Zampini       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4593ed502f03SStefano Zampini       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4594ed502f03SStefano Zampini       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4595ed502f03SStefano Zampini       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4596ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4597ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4598ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4599ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4600ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4601ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4602ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4603ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4604ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4605ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4606ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4607ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4608ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4609a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
46101a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4611ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4612ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4613ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4614ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4615ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4616ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4617ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4618ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
46191a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4620ed502f03SStefano Zampini       }
4621ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4622ed502f03SStefano Zampini     }
4623ed502f03SStefano Zampini   }
4624ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4625ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4626ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4627ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4628ed502f03SStefano Zampini   PetscFunctionReturn(0);
4629ed502f03SStefano Zampini }
4630c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the device values array of A into v.

  Input Parameters:
+ A   - matrix whose values array is obtained through MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of values to copy
- idx - positions in the values array to gather; when NULL (or n is 0) the first n
        values are copied contiguously instead

  Output Parameter:
. v   - receives the n values; isCudaMem() decides whether it is treated as device or
        host memory

  Notes:
  idx is uploaded to the device and accounted as a CPU-to-GPU transfer, so it is
  expected to be a host array.

  When v is host memory the gathered values are staged in a temporary device vector and
  then copied back; that copy is accounted as a GPU-to-CPU transfer.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* gather path: build the device copy of the indices directly from the host range */
    THRUSTINTARRAY widx(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* Staging buffer used only when v is host memory. It is an automatic object so that it
       is released on every exit path, including the early return taken by CHKERRCUDA() */
    THRUSTARRAY                     w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v); /* write straight into the caller's device buffer */
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = av[widx[i]] for i in [0,n) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* contiguous path: plain copy of the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values travelled from the device to the host, so log them in that direction */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4670