xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 5f101d05fc29035e9fefb950885df71a7b968e6a)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
16e8d2b73aSMark Adams 
/* Names of the GPU storage formats, laid out for PetscOptionsEnum(): value names, then the enum type name, the common prefix and a 0 terminator */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  0
};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The enums below are reproduced from cusparse.h in CUDA-11.0. The string tables that follow list the
    names in 0-based integer value order, because PetscOptionsEnum() is used to parse the corresponding
    user command line options and it maps a name to its position in the table.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[] = {
    "MV_ALG_DEFAULT",
    "COOMV_ALG",
    "CSRMV_ALG1",
    "CSRMV_ALG2",
    "cusparseSpMVAlg_t",
    "CUSPARSE_",
    0
  };
  /* note that COO_ALG4 (value 5) comes after CSR_ALG1 (value 4): the table follows the integer values, not the names */
  const char *const MatCUSPARSESpMMAlgorithms[] = {
    "ALG_DEFAULT",
    "COO_ALG1",
    "COO_ALG2",
    "COO_ALG3",
    "CSR_ALG1",
    "COO_ALG4",
    "CSR_ALG2",
    "cusparseSpMMAlg_t",
    "CUSPARSE_SPMM_",
    0
  };
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {
    "INVALID" /*cusparse does not have enum 0! We created one*/,
    "ALG1",
    "ALG2",
    "cusparseCsr2CscAlg_t",
    "CUSPARSE_CSR2CSC_",
    0
  };
#endif
539ae82921SPaul Mullowney 
/* Cholesky / ICC factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* LU / ILU factorization entry points */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* triangular solves, options handling and matrix-level operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* destructors for the GPU-side data structures; MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded on its argument types */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* host <-> device data movement and transpose bookkeeping */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* COO assembly interface */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91c215019aSStefano Zampini 
/*
  MatCUSPARSESetStream - stores a CUDA stream in the matrix's CUSPARSE data and
  attaches that stream to the cuSPARSE handle kept alongside it.

  Input Parameters:
    A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
    stream - the CUDA stream to record and hand to cusparseSetStream()

  Raises PETSC_ERR_COR when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cuerr;

  PetscFunctionBegin;
  if (!gpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  gpu->stream = stream;
  cuerr = cusparseSetStream(gpu->handle,gpu->stream);CHKERRCUSPARSE(cuerr);
  PetscFunctionReturn(0);
}
103b06137fdSPaul Mullowney 
/*
  MatCUSPARSESetHandle - installs a cuSPARSE handle in the matrix's CUSPARSE data.

  Input Parameters:
    A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
    handle - the cuSPARSE handle to adopt

  If a different, non-NULL handle is already stored it is destroyed before the new
  one is recorded. The stored handle is then put in device pointer mode.
  Raises PETSC_ERR_COR when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cuerr;

  PetscFunctionBegin;
  if (!gpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (gpu->handle != handle) {
    /* release the handle that is about to be replaced, if there is one */
    if (gpu->handle) {cuerr = cusparseDestroy(gpu->handle);CHKERRCUSPARSE(cuerr);}
    gpu->handle = handle;
  }
  cuerr = cusparseSetPointerMode(gpu->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cuerr);
  PetscFunctionReturn(0);
}
120b06137fdSPaul Mullowney 
/*
  MatCUSPARSEClearHandle - forgets the cuSPARSE handle stored in the matrix.

  Input Parameter:
    A - the matrix

  Does nothing unless A is of type MATSEQAIJCUSPARSE and has a non-NULL spptr.
  The stored handle is only set to 0 here; it is not destroyed by this routine.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqcusparse);CHKERRQ(ierr);
  if (isseqcusparse && gpu && gpu->handle) {
    gpu->handle = 0;
  }
  PetscFunctionReturn(0);
}
133b06137fdSPaul Mullowney 
/*
  MatFactorGetSolverType_seqaij_cusparse - reports MATSOLVERCUSPARSE as the solver type.

  Input Parameter:
    A    - the factored matrix (not inspected)

  Output Parameter:
    type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1409ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU, of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1539ae82921SPaul Mullowney 
/*
  MatGetFactor_seqaijcusparse_cusparse - creates the (empty) factor matrix used by the CUSPARSE solver.

  Input Parameters:
    A     - the matrix to be factored; only its local row count, communicator and block sizes are read here
    ftype - the requested factorization (LU, ILU, ILUDT, Cholesky or ICC)

  Output Parameter:
    B     - new square MATSEQAIJCUSPARSE matrix with the symbolic-factorization callbacks
            and the preferred orderings installed

  Raises PETSC_ERR_SUP for any other factor type.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       nrows = A->rmap->n;
  Mat            F;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  F    = *B;
  ierr = MatSetSizes(F,nrows,nrows,nrows,nrows);CHKERRQ(ierr);
  /* the factor type is recorded before the type is set, as in the statement order this routine has always used */
  F->factortype = ftype;
  ierr = MatSetType(F,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(F,A,A);CHKERRQ(ierr);
    F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* nested dissection for full LU, natural ordering for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  F->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1849ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - records the requested GPU storage format in the matrix's CUSPARSE data.

  Input Parameters:
    A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
    op     - the operation the format applies to; MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are accepted
             and both store the format in the same field
    format - the storage format to record

  Raises PETSC_ERR_COR when A->spptr is NULL (same check and message as MatCUSPARSESetStream()
  and MatCUSPARSESetHandle()) and PETSC_ERR_SUP for any other value of op.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* fail with a PETSc error instead of dereferencing a NULL spptr */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
2029ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch through the "MatCUSPARSESetFormat_C" method name rather than calling an implementation directly */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
229e057df02SPaul Mullowney 
/*
  MatSetOption_SeqAIJCUSPARSE - option handler for SEQAIJCUSPARSE matrices.

  Input Parameters:
    A   - the matrix
    op  - the option being set
    flg - the new value of the option

  MAT_FORM_EXPLICIT_TRANSPOSE is handled here; every other option is passed on to MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when the option is being switched off, drop the transpose so that turning it back on later cannot cause logic errors */
    if (!flg && A->form_explicit_transpose) {
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
    }
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
247e6e9a74fSStefano Zampini 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization for SEQAIJCUSPARSE matrices.

  Input Parameters:
    B    - the factor matrix (already through its symbolic phase)
    A    - the matrix to factor
    info - factorization options, passed on to MatLUFactorNumeric_SeqAIJ()

  The values of A are first brought back from the GPU, the factorization itself is done by
  MatLUFactorNumeric_SeqAIJ(), B is marked as valid on the CPU, the solve callbacks are
  chosen according to whether both orderings are the identity, and finally
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() is called on B.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             rowperm = fact->row,colperm = fact->col;
  PetscBool      rowid,colid;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variant: the natural-ordering versions apply only when both permutations are the identity */
  ierr = ISIdentity(rowperm,&rowid);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colid);CHKERRQ(ierr);
  if (rowid && colid) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* the multi-right-hand-side solves are cleared in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280bddcd29dSMark Adams 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* options of a SEQAIJCUSPARSE matrix.

  Input Parameters:
    PetscOptionsObject - the options context used by the PetscOptionsXXX() calls
    A                  - the matrix

  The options are examined only for unfactored matrices (A->factortype == MAT_FACTOR_NONE).
  With CUDA 11 or later the SpMV, SpMM and CSR-to-CSC algorithm selections are available as well;
  each of them is followed by a check that the cuSPARSE enum value PETSc relies on has not changed.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                given;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)gpu->format,(PetscEnum*)&fmt,&given);CHKERRQ(ierr);
    if (given) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);
    }

    /* storage format used for every operation */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)gpu->format,(PetscEnum*)&fmt,&given);CHKERRQ(ierr);
    if (given) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)gpu->spmvAlg,(PetscEnum*)&gpu->spmvAlg,&given);CHKERRQ(ierr);
    /* PetscOptionsEnum() derives the enum value from the position in MatCUSPARSESpMVAlgorithms[], so if the option was used make sure that still agrees with cuSPARSE */
    if (given && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)gpu->spmmAlg,(PetscEnum*)&gpu->spmmAlg,&given);CHKERRQ(ierr);
    if (given && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)gpu->csr2cscAlg,(PetscEnum*)&gpu->csr2cscAlg,&given);CHKERRQ(ierr);
    if (given && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3169ae82921SPaul Mullowney 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - symbolic ILU factorization for SEQAIJCUSPARSE matrices.

  Input Parameters:
    B            - the factor matrix; B->spptr is interpreted as a Mat_SeqAIJCUSPARSETriFactors
    A            - the matrix to factor
    isrow, iscol - row and column orderings
    info         - factorization options

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatILUFactorSymbolic_SeqAIJ() and installs the CUSPARSE numeric LU routine on B.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3289ae82921SPaul Mullowney 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - symbolic LU factorization for SEQAIJCUSPARSE matrices.

  Input Parameters:
    B            - the factor matrix; B->spptr is interpreted as a Mat_SeqAIJCUSPARSETriFactors
    A            - the matrix to factor
    isrow, iscol - row and column orderings
    info         - factorization options

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatLUFactorSymbolic_SeqAIJ() and installs the CUSPARSE numeric LU routine on B.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3409ae82921SPaul Mullowney 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - symbolic ICC factorization for SEQAIJCUSPARSE matrices.

  Input Parameters:
    B    - the factor matrix; B->spptr is interpreted as a Mat_SeqAIJCUSPARSETriFactors
    A    - the matrix to factor
    perm - the ordering
    info - factorization options

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatICCFactorSymbolic_SeqAIJ() and installs the CUSPARSE numeric Cholesky routine on B.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
352087f3262SPaul Mullowney 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - symbolic Cholesky factorization for SEQAIJCUSPARSE matrices.

  Input Parameters:
    B    - the factor matrix; B->spptr is interpreted as a Mat_SeqAIJCUSPARSETriFactors
    A    - the matrix to factor
    perm - the ordering
    info - factorization options

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatCholeskyFactorSymbolic_SeqAIJ() and installs the CUSPARSE numeric Cholesky routine on B.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
364087f3262SPaul Mullowney 
365087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3669ae82921SPaul Mullowney {
3679ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3689ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3699ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
370aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3719ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3729ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3739ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3749ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3759ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
376b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
37757d48284SJunchao Zhang   cudaError_t                       cerr;
3789ae82921SPaul Mullowney 
3799ae82921SPaul Mullowney   PetscFunctionBegin;
380cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
381c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3829ae82921SPaul Mullowney     try {
3839ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3849ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
385da79fbbcSStefano Zampini       if (!loTriFactor) {
3862cbc15d9SMark         PetscScalar                       *AALo;
3872cbc15d9SMark 
3882cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3899ae82921SPaul Mullowney 
3909ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
39157d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
39257d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3939ae82921SPaul Mullowney 
3949ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3959ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3969ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3979ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3989ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3999ae82921SPaul Mullowney         v        = aa;
4009ae82921SPaul Mullowney         vi       = aj;
4019ae82921SPaul Mullowney         offset   = 1;
4029ae82921SPaul Mullowney         rowOffset= 1;
4039ae82921SPaul Mullowney         for (i=1; i<n; i++) {
4049ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
405e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
4069ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
4079ae82921SPaul Mullowney           rowOffset += nz+1;
4089ae82921SPaul Mullowney 
409580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
410580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
4119ae82921SPaul Mullowney 
4129ae82921SPaul Mullowney           offset      += nz;
4139ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4149ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4159ae82921SPaul Mullowney           offset      += 1;
4169ae82921SPaul Mullowney 
4179ae82921SPaul Mullowney           v  += nz;
4189ae82921SPaul Mullowney           vi += nz;
4199ae82921SPaul Mullowney         }
4202205254eSKarl Rupp 
421aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
422da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
423da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
424aa372e3fSPaul Mullowney         /* Create the matrix description */
42557d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
42657d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4271b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
428afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
429afb2bd1cSJunchao Zhang        #else
43057d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
431afb2bd1cSJunchao Zhang        #endif
43257d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
43357d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
434aa372e3fSPaul Mullowney 
435aa372e3fSPaul Mullowney         /* set the operation */
436aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
437aa372e3fSPaul Mullowney 
438aa372e3fSPaul Mullowney         /* set the matrix */
439aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
440aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
441aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
442aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
443aa372e3fSPaul Mullowney 
444aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
445aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
446aa372e3fSPaul Mullowney 
447aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
448aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
449aa372e3fSPaul Mullowney 
450aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
451aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
452aa372e3fSPaul Mullowney 
453afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
454da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
455afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4561b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
457afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
458afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
459afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
460afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
461afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
462afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
463afb2bd1cSJunchao Zhang       #endif
464afb2bd1cSJunchao Zhang 
465aa372e3fSPaul Mullowney         /* perform the solve analysis */
466aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
467aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
468aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
469d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
4701b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
471d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
472d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
473d49cd2b7SBarry Smith                                #else
474d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
475afb2bd1cSJunchao Zhang                                #endif
476da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
477da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
478aa372e3fSPaul Mullowney 
479da79fbbcSStefano Zampini         /* assign the pointer */
480aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4812cbc15d9SMark         loTriFactor->AA_h = AALo;
48257d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
48357d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4844863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
485da79fbbcSStefano Zampini       } else { /* update values only */
4862cbc15d9SMark         if (!loTriFactor->AA_h) {
4872cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4882cbc15d9SMark         }
489da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4902cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
491da79fbbcSStefano Zampini         v        = aa;
492da79fbbcSStefano Zampini         vi       = aj;
493da79fbbcSStefano Zampini         offset   = 1;
494da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
495da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4962cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
497da79fbbcSStefano Zampini           offset      += nz;
4982cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
499da79fbbcSStefano Zampini           offset      += 1;
500da79fbbcSStefano Zampini           v  += nz;
501da79fbbcSStefano Zampini         }
5022cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
503da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
504da79fbbcSStefano Zampini       }
5059ae82921SPaul Mullowney     } catch(char *ex) {
5069ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5079ae82921SPaul Mullowney     }
5089ae82921SPaul Mullowney   }
5099ae82921SPaul Mullowney   PetscFunctionReturn(0);
5109ae82921SPaul Mullowney }
5119ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - builds, or refreshes the values of, the
  upper triangular factor kept in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  The routine does nothing when the matrix has no rows, or when A->offloadmask is
  neither PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  When no upper factor exists yet, a CSR matrix (row offsets, column indices, values)
  is assembled in pinned host buffers from a->j, a->a and a->diag, copied into the
  thrust arrays of a new CsrMatrix, and the cuSPARSE triangular-solve analysis is run
  on it. When a factor already exists only the values are rebuilt and re-uploaded.

  In both cases each row is written with its diagonal entry first, stored as the
  reciprocal of the value found in a->a, followed by the remaining entries of the row.

  Input Parameter:
.  A - the factored matrix (Mat_SeqAIJ data, Mat_SeqAIJCUSPARSETriFactors in spptr)
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  const PetscInt                    nrows    = A->rmap->n;
  const PetscInt                    *colIdx  = aij->j,*diagPos = aij->diag,*srcCols;
  const MatScalar                   *vals    = aij->a,*srcVals;
  PetscInt                          *rowPtrHost,*colIdxHost;
  PetscInt                          row,nOffDiag,nnzUpper,pos;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* only act when the offload mask says UNALLOCATED or CPU */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of nonzeros in the upper triangular matrix, diagonal included */
    nnzUpper = diagPos[0]-diagPos[nrows];

    if (upper) {
      /* the factor already exists: rebuild and upload the values only */
      if (!upper->AA_h) {
        cerr = cudaMallocHost((void**) &upper->AA_h, nnzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }

      /* walk the rows from last to first, filling the value array from its end */
      pos = nnzUpper;
      for (row=nrows; row-- > 0;) {
        srcVals  = vals + diagPos[row+1] + 1;
        nOffDiag = diagPos[row] - diagPos[row+1] - 1; /* entries NOT on the diagonal */
        pos     -= (nOffDiag+1);

        /* diagonal first (reciprocal of the stored value), then the rest of the row */
        upper->AA_h[pos] = 1./srcVals[nOffDiag];
        ierr = PetscArraycpy(&(upper->AA_h[pos+1]), srcVals, nOffDiag);CHKERRQ(ierr);
      }
      upper->csrMat->values->assign(upper->AA_h, upper->AA_h+nnzUpper);
      ierr = PetscLogCpuToGpu(nnzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      PetscScalar *valsHost;
      CsrMatrix   *csr;

      /* pinned host staging buffers: values, row offsets, column indices */
      cerr = cudaMallocHost((void**) &valsHost, nnzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &rowPtrHost, (nrows+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &colIdxHost, nnzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* assemble the CSR arrays, rows visited from last to first */
      rowPtrHost[0]     = (PetscInt) 0;
      rowPtrHost[nrows] = nnzUpper;
      pos               = nnzUpper;
      for (row=nrows; row-- > 0;) {
        srcVals  = vals   + diagPos[row+1] + 1;
        srcCols  = colIdx + diagPos[row+1] + 1;
        nOffDiag = diagPos[row] - diagPos[row+1] - 1; /* entries NOT on the diagonal */
        pos     -= (nOffDiag+1);

        /* diagonal entry leads the row; its value is the reciprocal of the stored one */
        colIdxHost[pos] = (PetscInt) row;
        valsHost[pos]   = (MatScalar)1./srcVals[nOffDiag];
        rowPtrHost[row] = rowPtrHost[row+1] - (nOffDiag+1);

        ierr = PetscArraycpy(&(colIdxHost[pos+1]), srcCols, nOffDiag);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(valsHost[pos+1]), srcVals, nOffDiag);CHKERRQ(ierr);
      }

      /* create the triangular factor record */
      ierr = PetscNew(&upper);CHKERRQ(ierr);
      upper->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero-based, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upper->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upper->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upper->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upper->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* the solve uses the matrix as is (no transpose) */
      upper->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR container and its three arrays, filled from the staging buffers */
      upper->csrMat    = csr = new CsrMatrix;
      csr->num_rows    = nrows;
      csr->num_cols    = nrows;
      csr->num_entries = nnzUpper;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(rowPtrHost, rowPtrHost+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnzUpper);
      csr->column_indices->assign(colIdxHost, colIdxHost+nnzUpper);

      csr->values = new THRUSTARRAY(nnzUpper);
      csr->values->assign(valsHost, valsHost+nnzUpper);

      /* solve analysis, timed under the MAT_CUSPARSESolveAnalysis event */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&upper->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      /* with CUDA >= 9 a work buffer is sized and allocated before the analysis */
      stat = cusparse_get_svbuffsize(factors->handle, upper->solveOp,
                                     csr->num_rows, csr->num_entries, upper->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), upper->solveInfo,
                                     &upper->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upper->solveBuffer,upper->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      stat = cusparse_analysis(factors->handle, upper->solveOp,
                               csr->num_rows, csr->num_entries, upper->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                               upper->solveInfo,
                               upper->solvePolicy, upper->solveBuffer);CHKERRCUSPARSE(stat);
                             #else
                               upper->solveInfo);CHKERRCUSPARSE(stat);
                             #endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the value staging buffer is kept in AA_h, the index buffers are released */
      factors->upTriFactorPtr = upper;
      upper->AA_h             = valsHost;
      cerr = cudaFreeHost(rowPtrHost);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(colIdxHost);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((nrows+1+nnzUpper)*sizeof(int)+nnzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
6579ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds the lower and upper triangular
  ILU factors through the two Build routines, makes sure a work vector of length n
  exists, records the nonzero count, sets A->offloadmask to PETSC_OFFLOAD_BOTH, and
  copies the row/column permutation indices into thrust arrays when the corresponding
  ordering is not the identity and has not been stored before.

  Errors with PETSC_ERR_COR when A->spptr is NULL.

  Input Parameter:
.  A - the factored matrix (Mat_SeqAIJ data, Mat_SeqAIJCUSPARSETriFactors in spptr)
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  IS                           rowPerm  = aij->row,colPerm = aij->icol;
  const PetscInt               nrows    = A->rmap->n;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* lower factor first, then upper */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* the work vector is created once and reused on later calls */
  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(nrows);
  }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: stored only for a non-identity ordering, and only once */
  ierr = ISIdentity(rowPerm,&rowIsIdentity);CHKERRQ(ierr);
  if (!(rowIsIdentity || factors->rpermIndices)) {
    const PetscInt *rowIdx;

    ierr = ISGetIndices(rowPerm,&rowIdx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(rowIdx, rowIdx+nrows);
    ierr = ISRestoreIndices(rowPerm,&rowIdx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same rule as for the rows */
  ierr = ISIdentity(colPerm,&colIsIdentity);CHKERRQ(ierr);
  if (!(colIsIdentity || factors->cpermIndices)) {
    const PetscInt *colIdx;

    ierr = ISGetIndices(colPerm,&colIdx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(colIdx, colIdx+nrows);
    ierr = ISRestoreIndices(colPerm,&colIdx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
7019ae82921SPaul Mullowney 
702087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
703087f3262SPaul Mullowney {
704087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
705087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
706aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
707aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
708087f3262SPaul Mullowney   cusparseStatus_t                  stat;
709087f3262SPaul Mullowney   PetscErrorCode                    ierr;
71057d48284SJunchao Zhang   cudaError_t                       cerr;
711087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
712087f3262SPaul Mullowney   PetscScalar                       *AAUp;
713087f3262SPaul Mullowney   PetscScalar                       *AALo;
714087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
715087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
716087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
717087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
718087f3262SPaul Mullowney 
719087f3262SPaul Mullowney   PetscFunctionBegin;
720cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
721c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
722087f3262SPaul Mullowney     try {
723da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
724da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
725da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
726087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
72757d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
72857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
729087f3262SPaul Mullowney 
730087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
731087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
732087f3262SPaul Mullowney         AiUp[n]=nzUpper;
733087f3262SPaul Mullowney         offset = 0;
734087f3262SPaul Mullowney         for (i=0; i<n; i++) {
735087f3262SPaul Mullowney           /* set the pointers */
736087f3262SPaul Mullowney           v  = aa + ai[i];
737087f3262SPaul Mullowney           vj = aj + ai[i];
738087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
739087f3262SPaul Mullowney 
740087f3262SPaul Mullowney           /* first, set the diagonal elements */
741087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
74209f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
743087f3262SPaul Mullowney           AiUp[i]      = offset;
74409f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
745087f3262SPaul Mullowney 
746087f3262SPaul Mullowney           offset+=1;
747087f3262SPaul Mullowney           if (nz>0) {
748f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
749580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
750087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
751087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
752087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
753087f3262SPaul Mullowney             }
754087f3262SPaul Mullowney             offset+=nz;
755087f3262SPaul Mullowney           }
756087f3262SPaul Mullowney         }
757087f3262SPaul Mullowney 
758aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
759da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
760da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
761087f3262SPaul Mullowney 
762aa372e3fSPaul Mullowney         /* Create the matrix description */
76357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
76457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7651b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
766afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
767afb2bd1cSJunchao Zhang        #else
76857d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
769afb2bd1cSJunchao Zhang        #endif
77057d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
77157d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
772087f3262SPaul Mullowney 
773aa372e3fSPaul Mullowney         /* set the matrix */
774aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
775aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
776aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
777aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
778aa372e3fSPaul Mullowney 
779aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
780aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
781aa372e3fSPaul Mullowney 
782aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
783aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
784aa372e3fSPaul Mullowney 
785aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
786aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
787aa372e3fSPaul Mullowney 
788afb2bd1cSJunchao Zhang         /* set the operation */
789afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
790afb2bd1cSJunchao Zhang 
791afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
792da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
793afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7941b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
795afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
796afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
797afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
798afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
799afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
800afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
801afb2bd1cSJunchao Zhang       #endif
802afb2bd1cSJunchao Zhang 
803aa372e3fSPaul Mullowney         /* perform the solve analysis */
804aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
805aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
806aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
807d49cd2b7SBarry Smith                                  upTriFactor->csrMat->column_indices->data().get(),
8081b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
809d49cd2b7SBarry Smith                                  upTriFactor->solveInfo,
810d49cd2b7SBarry Smith                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
811d49cd2b7SBarry Smith                                 #else
812d49cd2b7SBarry Smith                                   upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
813afb2bd1cSJunchao Zhang                                 #endif
814da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
815da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
816aa372e3fSPaul Mullowney 
817da79fbbcSStefano Zampini         /* assign the pointer */
818aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
819aa372e3fSPaul Mullowney 
820aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
821da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
822da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
823aa372e3fSPaul Mullowney 
824aa372e3fSPaul Mullowney         /* Create the matrix description */
82557d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
82657d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8271b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
828afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
829afb2bd1cSJunchao Zhang        #else
83057d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
831afb2bd1cSJunchao Zhang        #endif
83257d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
83357d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
834aa372e3fSPaul Mullowney 
835aa372e3fSPaul Mullowney         /* set the operation */
836aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
837aa372e3fSPaul Mullowney 
838aa372e3fSPaul Mullowney         /* set the matrix */
839aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
840aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
841aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
842aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
843aa372e3fSPaul Mullowney 
844aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
845aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
846aa372e3fSPaul Mullowney 
847aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
848aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
849aa372e3fSPaul Mullowney 
850aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
851aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
852aa372e3fSPaul Mullowney 
853afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
854da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
855afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8561b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
857afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
858afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
859afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
860afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
861afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
862afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
863afb2bd1cSJunchao Zhang       #endif
864afb2bd1cSJunchao Zhang 
865aa372e3fSPaul Mullowney         /* perform the solve analysis */
866aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
867aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
868aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
869d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
8701b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
871d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
872d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
873d49cd2b7SBarry Smith                                 #else
874d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
875afb2bd1cSJunchao Zhang                                 #endif
876da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
877da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
878aa372e3fSPaul Mullowney 
879da79fbbcSStefano Zampini         /* assign the pointer */
880aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
881087f3262SPaul Mullowney 
882da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
88357d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
88457d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
885da79fbbcSStefano Zampini       } else {
886da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
887da79fbbcSStefano Zampini         offset = 0;
888da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
889da79fbbcSStefano Zampini           /* set the pointers */
890da79fbbcSStefano Zampini           v  = aa + ai[i];
891da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
892da79fbbcSStefano Zampini 
893da79fbbcSStefano Zampini           /* first, set the diagonal elements */
894da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
895da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
896da79fbbcSStefano Zampini 
897da79fbbcSStefano Zampini           offset+=1;
898da79fbbcSStefano Zampini           if (nz>0) {
899da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
900da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
901da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
902da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
903da79fbbcSStefano Zampini             }
904da79fbbcSStefano Zampini             offset+=nz;
905da79fbbcSStefano Zampini           }
906da79fbbcSStefano Zampini         }
907da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
908da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
909da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
910da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
911da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
912da79fbbcSStefano Zampini       }
91357d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
91457d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
915087f3262SPaul Mullowney     } catch(char *ex) {
916087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
917087f3262SPaul Mullowney     }
918087f3262SPaul Mullowney   }
919087f3262SPaul Mullowney   PetscFunctionReturn(0);
920087f3262SPaul Mullowney }
921087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the ICC triangular factors of A on the GPU
  by calling MatSeqAIJCUSPARSEBuildICCTriMatrices(), makes sure the work vector exists, and,
  when the ordering stored in a->row is not the identity, uploads that permutation and its
  inverse to the GPU.

  Input Parameter:
. A - the factored matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Raises PETSC_ERR_COR if A->spptr is NULL.
  Sets A->offloadmask to PETSC_OFFLOAD_BOTH.
  This routine may run once per numeric factorization, so GPU arrays that already exist
  (work vector, permutation indices) are reused instead of being allocated again.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* every stored entry except n of them (presumably the diagonal) is counted twice */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* allocate the device index arrays only once; a repeated call would otherwise
       overwrite the pointers and leak the previously allocated arrays */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
959087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization entry point.

  Calls MatSeqAIJCUSPARSECopyFromGPU() on A, delegates the factorization itself to
  MatCholeskyFactorNumeric_SeqAIJ(), selects the solve callbacks of B according to
  whether the ordering b->row is the identity, and finally calls
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU() on B.

  Input Parameters:
+ B    - the factor matrix (its ops table and offloadmask are updated)
. A    - the matrix being factored
- info - factorization options, forwarded unchanged
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *factor  = (Mat_SeqAIJ*)B->data;
  IS             ordering = factor->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve flavour: identity ordering gets the NaturalOrdering variants */
  ierr = ISIdentity(ordering,&natural);CHKERRQ(ierr);
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* the multi-right-hand-side callbacks are cleared for either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9899ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSETransposeTriFactorForSolve_Private - Builds the explicit transpose of one
  triangular factor and runs the cuSPARSE solve analysis on it.

  Input Parameters:
+ A                  - the matrix, used only as the object attached to the log events
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the factor to transpose; with CUDA >= 11 its csr2cscBuffer and
                       csr2cscBufferSize fields are (re)set by this routine

  Output Parameter:
. triFactorTOut - the newly allocated transposed factor

  Notes:
  The transposed factor copies matrix type, index base and diagonal type from the source
  descriptor and uses the opposite fill mode. It is solved with
  CUSPARSE_OPERATION_NON_TRANSPOSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSETransposeTriFactorForSolve_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTOut)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the source factor; upper and lower are swapped for the transpose */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor (rows and columns swapped) */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 needs an explicit workspace; it is stored on the source factor */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above (the previous code called PetscLogEventBegin() a second time here) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           triFactorT->solveInfo,
                           triFactorT->solvePolicy, triFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           triFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorTOut = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Creates explicit transposes of the lower and
  upper triangular factors stored in A->spptr, analyzes them for triangular solves, and
  stores them in loTriFactorPtrTranspose / upTriFactorPtrTranspose.

  Input Parameter:
. A - the factored matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Raises PETSC_ERR_COR if the container or either triangular factor is missing.
  The lower factor is processed, and its transpose pointer assigned, before the upper one.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor,*upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = NULL;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = NULL;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactorForSolve_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactorForSolve_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1195bda325fcSPaul Mullowney 
/* Unary functor, callable on host and device, that converts a PetscScalar to a PetscInt
   by taking its real part and casting the result to an integer */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar value)
  {
    return static_cast<PetscInt>(PetscRealPart(value));
  }
};
1204a49f1ed0SStefano Zampini 
12051a2c6b5cSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1206bda325fcSPaul Mullowney {
1207aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1208a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1209bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1210bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1211aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1212b06137fdSPaul Mullowney   cudaError_t                  err;
121385ba7357SStefano Zampini   PetscErrorCode               ierr;
1214b175d8bbSPaul Mullowney 
1215bda325fcSPaul Mullowney   PetscFunctionBegin;
12161a2c6b5cSJunchao Zhang   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
1217a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1218a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1219e8d2b73aSMark Adams   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1220a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1221e8d2b73aSMark Adams   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
12221a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
122385ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1224ee7b52eaSHong Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1225a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1226a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1227a49f1ed0SStefano Zampini   }
1228a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1229aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
123057d48284SJunchao Zhang     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1231aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
123257d48284SJunchao Zhang     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
123357d48284SJunchao Zhang     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1234aa372e3fSPaul Mullowney 
1235b06137fdSPaul Mullowney     /* set alpha and beta */
1236afb2bd1cSJunchao Zhang     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12377656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12387656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1239afb2bd1cSJunchao Zhang     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12407656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12417656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1242b06137fdSPaul Mullowney 
1243aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1244aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1245a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1246554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1247554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1248aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1249a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1250aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1251aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1252a3fdcf43SKarl Rupp 
1253039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
125481902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1255afb2bd1cSJunchao Zhang 
1256afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1257afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&matstructT->matDescr,
1258afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1259afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1260afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1261afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1262afb2bd1cSJunchao Zhang                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1263afb2bd1cSJunchao Zhang      #endif
1264aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1265afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1266afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1267afb2bd1cSJunchao Zhang    #else
1268aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
126951c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
127051c6d536SStefano Zampini       /* First convert HYB to CSR */
1271aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1272aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1273aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1274aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1275aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1276aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1277aa372e3fSPaul Mullowney 
1278aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1279aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1280aa372e3fSPaul Mullowney                               temp->values->data().get(),
1281aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
128257d48284SJunchao Zhang                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1283aa372e3fSPaul Mullowney 
1284aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1285aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1286aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1287aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1288aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1289aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1290aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1291aa372e3fSPaul Mullowney 
1292aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1293aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1294aa372e3fSPaul Mullowney                               temp->values->data().get(),
1295aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1296aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1297aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1298aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1299aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
130057d48284SJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1301aa372e3fSPaul Mullowney 
1302aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1303aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
130457d48284SJunchao Zhang       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1305aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1306aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1307aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1308aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1309aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1310aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
131157d48284SJunchao Zhang                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1312aa372e3fSPaul Mullowney 
1313aa372e3fSPaul Mullowney       /* assign the pointer */
1314aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13151a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1316aa372e3fSPaul Mullowney       /* delete temporaries */
1317aa372e3fSPaul Mullowney       if (tempT) {
1318aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1319aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1320aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1321aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1322087f3262SPaul Mullowney       }
1323aa372e3fSPaul Mullowney       if (temp) {
1324aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1325aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1326aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1327aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1328aa372e3fSPaul Mullowney       }
1329afb2bd1cSJunchao Zhang      #endif
1330aa372e3fSPaul Mullowney     }
1331a49f1ed0SStefano Zampini   }
1332a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1333a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1334a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1335e8d2b73aSMark Adams     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1336e8d2b73aSMark Adams     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1337e8d2b73aSMark Adams     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1338e8d2b73aSMark Adams     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1339e8d2b73aSMark Adams     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1340e8d2b73aSMark Adams     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1341e8d2b73aSMark Adams     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1342e8d2b73aSMark Adams     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1343a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1344a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1345a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1346a49f1ed0SStefano Zampini       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1347a49f1ed0SStefano Zampini     }
1348a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1349a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1350a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1351a49f1ed0SStefano Zampini 
1352a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1353a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1354a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1355a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1356a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1357a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1358a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1359a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1360a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1361a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1362a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1363a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1364a49f1ed0SStefano Zampini                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1365a49f1ed0SStefano Zampini       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1366a49f1ed0SStefano Zampini      #endif
1367a49f1ed0SStefano Zampini 
13681a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13691a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13701a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13711a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
13721a2c6b5cSJunchao Zhang 
13731a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13741a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13751a2c6b5cSJunchao Zhang         */
13761a2c6b5cSJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
13771a2c6b5cSJunchao Zhang                               A->cmap->n,matrix->num_entries,
13781a2c6b5cSJunchao Zhang                               csr2csc_a.data().get(),
13791a2c6b5cSJunchao Zhang                               cusparsestruct->rowoffsets_gpu->data().get(),
13801a2c6b5cSJunchao Zhang                               matrix->column_indices->data().get(),
1381a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1382a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1383a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1384a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
13851a2c6b5cSJunchao Zhang                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1386a49f1ed0SStefano Zampini                              #else
1387a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
13881a2c6b5cSJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1389a49f1ed0SStefano Zampini                              #endif
13901a2c6b5cSJunchao Zhang       } else {
13911a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
13921a2c6b5cSJunchao Zhang       }
13931a2c6b5cSJunchao Zhang 
1394a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1395a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1396a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1397a49f1ed0SStefano Zampini       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1398a49f1ed0SStefano Zampini      #endif
1399a49f1ed0SStefano Zampini     }
1400a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1401a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1402a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1403a49f1ed0SStefano Zampini   }
1404ee7b52eaSHong Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
140585ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1406213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1407213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1408aa372e3fSPaul Mullowney   /* assign the pointer */
1409aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
14101a2c6b5cSJunchao Zhang   A->transupdated = PETSC_TRUE;
1411bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1412bda325fcSPaul Mullowney }
1413bda325fcSPaul Mullowney 
1414a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose solve on the GPU with the triangular factors stored in
  A->spptr, using the row and column permutations kept alongside them.

  Input Parameters:
    A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
    bb - right-hand side, obtained with VecCUDAGetArrayRead()

  Output Parameter:
    xx - solution, obtained with VecCUDAGetArrayWrite()

  Notes:
    When neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called to
    build them before solving.

    Every Thrust call is issued through PetscStackCallThrust(), as the other Thrust calls in this file
    are, so that a Thrust failure takes the same error path as the rest of the routine.

    Both ends of each permuted range are built on the same element iterator, as MatSolve_SeqAIJCUSPARSE()
    does; the extent of the range is given by the index iterators.

    The CUDA-version split is placed around whole cusparse_solve() calls so that no preprocessor
    directive sits inside an argument list.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors on the fly if they are not there yet */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: bb is gathered into xx */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* Next, solve with the transposed U factor; xarray is passed first and the work vector second */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        upTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Then, solve with the transposed L factor; the work vector is passed first and xarray second */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        loTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1502bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose solve on the GPU with the triangular
  factors stored in A->spptr; bb and xx are handed to the solves directly, without any permutation.

  Input Parameters:
    A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
    bb - right-hand side, obtained with VecCUDAGetArrayRead()

  Output Parameter:
    xx - solution, obtained with VecCUDAGetArrayWrite()

  Notes:
    When neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called to
    build them before solving. The factors' work vector carries the intermediate result between the
    two cusparse_solve() calls.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors lazily: only when both are still missing */
  if (!(lowerT || upperT)) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the solution and of the right-hand side */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: transposed U factor; rhs is passed first and the work vector second */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  cerr = cusparse_solve(factors->handle, upperT->solveOp,
                        upperT->csrMat->num_rows, upperT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        rhs, work->data().get(),
                        upperT->solvePolicy, upperT->solveBuffer);CHKERRCUSPARSE(cerr);
#else
  cerr = cusparse_solve(factors->handle, upperT->solveOp,
                        upperT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        rhs, work->data().get());CHKERRCUSPARSE(cerr);
#endif

  /* Step 2: transposed L factor; the work vector is passed first and sol second */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  cerr = cusparse_solve(factors->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows, lowerT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        work->data().get(), sol,
                        lowerT->solvePolicy, lowerT->solveBuffer);CHKERRCUSPARSE(cerr);
#else
  cerr = cusparse_solve(factors->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        work->data().get(), sol);CHKERRCUSPARSE(cerr);
#endif

  /* Hand the arrays back, then close the timer and log the flops */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1572bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - solve on the GPU with the triangular factors stored in A->spptr, using the
  row and column permutations kept alongside them.

  Input Parameters:
    A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
    bb - right-hand side, obtained with VecCUDAGetArrayRead()

  Output Parameter:
    xx - solution, obtained with VecCUDAGetArrayWrite()

  Notes:
    The factors' work vector receives the row-permuted right-hand side and later the result of the
    U solve; xx holds the result of the L solve in between and the column-permuted result at the end.

    Every Thrust call is issued through PetscStackCallThrust(), as the other Thrust calls in this file
    are, so that a Thrust failure takes the same error path as the rest of the routine.

    The CUDA-version split is placed around whole cusparse_solve() calls so that no preprocessor
    directive sits inside an argument list.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: bb is gathered into the work vector */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    tempGPU->begin()));

  /* Next, solve L; the work vector is passed first and xarray second */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                        loTriFactor->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Then, solve U; xarray is passed first and the work vector second */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                        upTriFactor->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray,
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray,
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation: the work vector is gathered into xx */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
                                    xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16489ae82921SPaul Mullowney 
/* Runs a single sparse triangular solve with the given factor on the given cusparse handle:
   rhs is the input vector, sol receives the result. Both are passed straight through to cusparse_solve. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering_TriSolve(cusparseHandle_t handle,Mat_SeqAIJCUSPARSETriFactorStruct *factor,const PetscScalar *rhs,PetscScalar *sol)
{
  CsrMatrix        *csr = factor->csrMat;
  cusparseStatus_t stat;

  PetscFunctionBegin;
 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  /* newer interface: also takes the number of entries, a solve policy and a work buffer */
  stat = cusparse_solve(handle, factor->solveOp,
                        csr->num_rows, csr->num_entries,
                        &PETSC_CUSPARSE_ONE, factor->descr,
                        csr->values->data().get(),
                        csr->row_offsets->data().get(),
                        csr->column_indices->data().get(),
                        factor->solveInfo,
                        rhs, sol,
                        factor->solvePolicy, factor->solveBuffer);CHKERRCUSPARSE(stat);
 #else
  stat = cusparse_solve(handle, factor->solveOp,
                        csr->num_rows,
                        &PETSC_CUSPARSE_ONE, factor->descr,
                        csr->values->data().get(),
                        csr->row_offsets->data().get(),
                        csr->column_indices->data().get(),
                        factor->solveInfo,
                        rhs, sol);CHKERRCUSPARSE(stat);
 #endif
  PetscFunctionReturn(0);
}

/*
  Solve with the stored lower/upper triangular factors of A, applying no row or column permutation:
  bb is fed directly to the L solve, whose result lands in the factors' work vector, and the U solve
  then writes from the work vector into xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Obtain device pointers: xx is write-only, bb is read-only */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* L solve: b -> work */
  ierr = MatSolve_SeqAIJCUSPARSE_NaturalOrdering_TriSolve(factors->handle,lower,b,work->data().get());CHKERRQ(ierr);
  /* U solve: work -> x */
  ierr = MatSolve_SeqAIJCUSPARSE_NaturalOrdering_TriSolve(factors->handle,upper,work->data().get(),x);CHKERRQ(ierr);

  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17109ae82921SPaul Mullowney 
/*
  If the device holds the only current copy of A's values (offloadmask is PETSC_OFFLOAD_GPU),
  download the a->nz CSR values into the host array a->a and mark both copies as valid.
  For any other offload state this is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
    Mat_SeqAIJCUSPARSE *gpu    = (Mat_SeqAIJCUSPARSE*)A->spptr;
    CsrMatrix          *devcsr = (CsrMatrix*)gpu->mat->mat;
    const size_t       nbytes  = aij->nz*sizeof(PetscScalar);

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* device values -> host values */
    cerr = cudaMemcpy(aij->a, devcsr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
17317e8381f9SStefano Zampini 
/*
  Hands out the host array of matrix values. The values are first brought back from the device if
  needed; afterwards the matrix is flagged PETSC_OFFLOAD_CPU, i.e. only the host copy is considered
  current once the caller holds the pointer.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* refresh the host values before exposing them */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
17437e8381f9SStefano Zampini 
/*
  Allocates a device-side CsrMatrix with nrows rows, ncols columns and nnz entries and uploads the
  host CSR arrays into it. Row offsets (ii, nrows+1 entries) and column indices (jj, nnz entries) are
  always uploaded; the values array is allocated with nnz entries but only filled from the host when
  vals is non-NULL. Intended to be called from inside the caller's try block.
*/
static CsrMatrix *MatSeqAIJCUSPARSECopyToGPU_NewCsr(PetscInt nrows,PetscInt ncols,PetscInt nnz,const PetscInt *ii,const PetscInt *jj,const PetscScalar *vals)
{
  CsrMatrix *mat = new CsrMatrix;

  mat->num_rows    = nrows;
  mat->num_cols    = ncols;
  mat->num_entries = nnz;

  mat->row_offsets = new THRUSTINTARRAY32(nrows+1);
  mat->row_offsets->assign(ii, ii + nrows+1);

  mat->column_indices = new THRUSTINTARRAY32(nnz);
  mat->column_indices->assign(jj, jj+nnz);

  mat->values = new THRUSTARRAY(nnz);
  if (vals) mat->values->assign(vals, vals+nnz);
  return mat;
}

/*
  Brings the device representation of A up to date with the host data.

  Nothing is done unless the offload mask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
  - If the nonzero state is unchanged and the storage format is CSR, only the values are re-uploaded
    into the existing device matrix.
  - Otherwise the device structure (and any cached transpose) is destroyed and rebuilt from the host
    CSR data, honoring compressed-row storage when a->compressedrow.use is set.

  On success the offload mask becomes PETSC_OFFLOAD_BOTH, except when the host has no values array
  (a->a is NULL) during a rebuild, in which case the mask is left unchanged.

  Errors: PETSC_ERR_GPU if A is bound to the CPU or required host CSR data is missing;
          PETSC_ERR_SUP for ELL/HYB formats with CUDA >= 11.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  PetscBool          both = PETSC_TRUE;
  cudaError_t        err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed: cached transpose values are stale, its structure is kept */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else { /* Rebuild the whole device structure */
      PetscInt nnz;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        Mat_SeqAIJCUSPARSEMultStruct *matstruct;
        PetscInt                     m,*ii,*ridx,ncprow;

        /* pick either the compressed-row view or the plain CSR view of the host matrix */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* without host values only the structure is uploaded, so the two copies are not both valid */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; the handle is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat = MatSeqAIJCUSPARSECopyToGPU_NewCsr(m,A->cmap->n,nnz,ii,a->j,a->a);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* temporary CSR copy on the device, converted to HYB/ELL and then released */
          CsrMatrix *mat = MatSeqAIJCUSPARSECopyToGPU_NewCsr(m,A->cmap->n,nnz,ii,a->j,a->a);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* delete on a null pointer is a no-op, so no guards are needed */
          delete mat->values;
          delete mat->column_indices;
          delete mat->row_offsets;
          delete mat;
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          ncprow = m;
        } else {
          /* workVector was already reset to NULL above */
          matstruct->cprowIndices = NULL;
          ncprow = 0;
        }
        /* log what was actually uploaded: row offsets + column indices (int), compressed row indices (PetscInt),
           the three scalar constants, and the values only if the host had any */
        ierr = PetscLogCpuToGpu(((m+1)+nnz)*sizeof(int)+ncprow*sizeof(PetscInt)+(3+(a->a ? nnz : 0))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
19009ae82921SPaul Mullowney 
/* Functor over a 2-element tuple: adds element 0 into element 1 (element 1 += element 0). */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    /* element 1 is both read and written; element 0 is only read */
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
1910aa372e3fSPaul Mullowney 
/* Functor over a 2-element tuple: copies element 0 into element 1. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    /* source is slot 0, destination is slot 1 */
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
19207e8381f9SStefano Zampini 
/* Functor over a 2-element tuple: copies element 1 into element 0 (the opposite direction of VecCUDAEquals). */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    /* source is slot 1, destination is slot 0 */
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1930e6e9a74fSStefano Zampini 
/* Per-product data attached to a matrix product; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;    /* NOTE(review): presumably records whether C was originally a (non-CUDA) dense matrix -- confirm at the use sites */
  PetscScalar           *Bt;         /* device buffer, released with cudaFree(); NOTE(review): name suggests a transposed copy of B -- confirm */
  Mat                   X;           /* auxiliary matrix; the dense numeric product writes into it instead of C for PtAP and RARt */
  PetscBool             reusesym;    /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;       /* NOTE(review): presumably the flop count logged for the product -- confirm */
  CsrMatrix             *Bcsr;       /* CsrMatrix wrapper; only the struct itself is deleted on destroy, not the arrays it points to */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse matrix descriptor, destroyed with cusparseDestroySpMat() */
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;   /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;   /* dense descriptor for C (or for X) */
  PetscInt              Blda;        /* Record leading dimension of B here to detect changes */
  PetscInt              Clda;        /* Record leading dimension of C here to detect changes */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;   /* device buffer, released with cudaFree() */
  void                  *dBuffer5;   /* device buffer, released with cudaFree() */
 #endif
  size_t                mmBufferSize; /* NOTE(review): presumably the size in bytes of mmBuffer -- confirm */
  void                  *mmBuffer;    /* device work buffer, released with cudaFree() */
  void                  *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* SpGEMM descriptor, destroyed with cusparseSpGEMM_destroyDescr() */
#endif
};
1955ccdfe979SStefano Zampini 
/*
  Destructor for a MatMatCusparse product context (passed as an opaque pointer).
  Releases the device buffers and cusparse descriptors it holds, destroys the auxiliary
  matrix X and finally frees the context itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *ctx = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(ctx->Bt);CHKERRCUDA(cerr);
  /* only the wrapper struct is deleted here, not the arrays it refers to */
  delete ctx->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cusparse descriptors: destroy only those that were created */
  if (ctx->matSpBDescr) {
    stat = cusparseDestroySpMat(ctx->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (ctx->matBDescr) {
    stat = cusparseDestroyDnMat(ctx->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (ctx->matCDescr) {
    stat = cusparseDestroyDnMat(ctx->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (ctx->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(ctx->spgemmDesc);CHKERRCUSPARSE(stat);
  }
  /* device work buffers */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (ctx->dBuffer4) {
    cerr = cudaFree(ctx->dBuffer4);CHKERRCUDA(cerr);
  }
  if (ctx->dBuffer5) {
    cerr = cudaFree(ctx->dBuffer5);CHKERRCUDA(cerr);
  }
 #endif
  if (ctx->mmBuffer) {
    cerr = cudaFree(ctx->mmBuffer);CHKERRCUDA(cerr);
  }
  if (ctx->mmBuffer2) {
    cerr = cudaFree(ctx->mmBuffer2);CHKERRCUDA(cerr);
  }
 #endif
  ierr = MatDestroy(&ctx->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1984ccdfe979SStefano Zampini 
1985ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1986ccdfe979SStefano Zampini 
1987ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1988ccdfe979SStefano Zampini {
1989ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1990ccdfe979SStefano Zampini   Mat                          A,B;
1991afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1992ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1993ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1994ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1995ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1996ccdfe979SStefano Zampini   const PetscScalar            *barray;
1997ccdfe979SStefano Zampini   PetscScalar                  *carray;
1998ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1999ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2000ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2001ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2002ccdfe979SStefano Zampini 
2003ccdfe979SStefano Zampini   PetscFunctionBegin;
2004ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2005e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2006ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
2007ccdfe979SStefano Zampini   A    = product->A;
2008ccdfe979SStefano Zampini   B    = product->B;
2009ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2010e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2011ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2012ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
2013ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2014ccdfe979SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2015ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2016ccdfe979SStefano Zampini   switch (product->type) {
2017ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2018ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2019ccdfe979SStefano Zampini     mat = cusp->mat;
2020ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2021ccdfe979SStefano Zampini     m   = A->rmap->n;
2022ccdfe979SStefano Zampini     n   = B->cmap->n;
2023ccdfe979SStefano Zampini     break;
2024ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
20251a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2026e6e9a74fSStefano Zampini       mat = cusp->mat;
2027e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2028e6e9a74fSStefano Zampini     } else {
20291a2c6b5cSJunchao Zhang       ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2030ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2031ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2032e6e9a74fSStefano Zampini     }
2033ccdfe979SStefano Zampini     m = A->cmap->n;
2034ccdfe979SStefano Zampini     n = B->cmap->n;
2035ccdfe979SStefano Zampini     break;
2036ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2037ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2038ccdfe979SStefano Zampini     mat = cusp->mat;
2039ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2040ccdfe979SStefano Zampini     m   = A->rmap->n;
2041ccdfe979SStefano Zampini     n   = B->rmap->n;
2042ccdfe979SStefano Zampini     break;
2043ccdfe979SStefano Zampini   default:
2044e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2045ccdfe979SStefano Zampini   }
2046e8d2b73aSMark Adams   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2047ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2048ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2049ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2050afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2051ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2052afb2bd1cSJunchao Zhang 
2053ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2054c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2055c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2056c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2057c8378d12SStefano Zampini   } else {
2058c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2059c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2060c8378d12SStefano Zampini   }
2061c8378d12SStefano Zampini 
2062c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2063afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2064afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2065a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2066afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2067fcdce8c4SStefano Zampini     size_t mmBufferSize;
2068afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2069afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2070afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2071afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2072afb2bd1cSJunchao Zhang     }
2073c8378d12SStefano Zampini 
2074afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2075afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2076afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2077afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2078afb2bd1cSJunchao Zhang     }
2079afb2bd1cSJunchao Zhang 
2080afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2081afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2082afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2083afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2084afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2085afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2086afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2087afb2bd1cSJunchao Zhang     }
2088afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2089afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2090afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2091fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2092fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2093ee7b52eaSHong Zhang       cudaError_t cerr;
2094fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2095fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2096fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2097fcdce8c4SStefano Zampini     }
2098afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2099afb2bd1cSJunchao Zhang   } else {
2100afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2101afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2102afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2103afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2104afb2bd1cSJunchao Zhang   }
2105afb2bd1cSJunchao Zhang 
2106afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2107afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2108afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2109afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2110fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2111afb2bd1cSJunchao Zhang  #else
2112afb2bd1cSJunchao Zhang   PetscInt k;
2113afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2114ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2115ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2116ccdfe979SStefano Zampini     cublasStatus_t cerr;
2117ccdfe979SStefano Zampini 
2118ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2119ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2120ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2121ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2122ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2123ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2124ccdfe979SStefano Zampini     blda = B->cmap->n;
2125afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2126afb2bd1cSJunchao Zhang   } else {
2127afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2128ccdfe979SStefano Zampini   }
2129ccdfe979SStefano Zampini 
2130afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2131ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2132afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2133ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2134ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2135ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2136ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2137ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2138afb2bd1cSJunchao Zhang  #endif
2139c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2140c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2141ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2142ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2143ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2144ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2145ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2146ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2147ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2148ccdfe979SStefano Zampini   } else {
2149ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2150ccdfe979SStefano Zampini   }
2151ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2152ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2153ccdfe979SStefano Zampini   }
2154ccdfe979SStefano Zampini   if (!biscuda) {
2155ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2156ccdfe979SStefano Zampini   }
2157ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2158ccdfe979SStefano Zampini }
2159ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - Symbolic phase for products of a MATSEQAIJCUSPARSE
  matrix A (product->A) with a dense matrix B (product->B).

  Input/Output Parameter:
. C - the product matrix; C->product must be set and C->product->data must be empty

  What it does:
   - checks that A is MATSEQAIJCUSPARSE and stored in MAT_CUSPARSE_CSR format
   - sets the sizes of C according to the product type (AB, AtB, ABt, PtAP, RARt)
   - sets the type of C to MATSEQDENSECUDA, recording in mmdata->cisdense whether C was MATSEQDENSE
     before the type change
   - allocates the MatMatCusparse product data: a device buffer for B^T (CUDA < 11, ABt and RARt only)
     and the intermediate matrix X (RARt and PtAP only)
   - installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric phase

  Errors:
   PETSC_ERR_GPU if product data is already present, if A has the wrong type or storage format,
   or if the product type is not supported.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* local sizes (m x n) of the product for each supported type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  /* attach the data right away (as done in MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE), so that
     whatever has been allocated so far is released through C->product->destroy if a call below fails */
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2233ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - Numeric phase of the sparse-sparse product stored
  in C, where C, product->A and product->B are all MATSEQAIJCUSPARSE in MAT_CUSPARSE_CSR format.

  Supported product types are AB, AtB and ABt. AtB (resp. ABt) is handled as AB when A (resp. B) is
  flagged symmetric; otherwise the transposed operand is taken from the matTranspose struct, because
  opA and opB are always CUSPARSE_OPERATION_NON_TRANSPOSE.

  No computation is done when mmdata->reusesym is set (the flag is cleared here) or when C has no
  nonzeros. In every non-error case the assembly bookkeeping at the end of the routine is performed.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  Mat_SeqAIJ     *c       = (Mat_SeqAIJ*)C->data;
  MatMatCusparse *mmdata;
  PetscBool      istype;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&istype);CHKERRQ(ierr);
  if (!istype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;

  if (mmdata->reusesym) {
    /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase;
       only validate the GPU data of C */
    Mat_SeqAIJCUSPARSE *Ccusp;

    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    if (!Ccusp->mat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    if (!Ccusp->mat->mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  } else if (c->nz) {
    Mat                          A = product->A,B = product->B;
    Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
    Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
    CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
    MatProductType               ptype;
    cusparseStatus_t             stat;
    cudaError_t                  cerr;
    cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

    /* operand validation */
    ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&istype);CHKERRQ(ierr);
    if (!istype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
    ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&istype);CHKERRQ(ierr);
    if (!istype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
    if (A->boundtocpu || B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
    Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Acusp->format != MAT_CUSPARSE_CSR || Bcusp->format != MAT_CUSPARSE_CSR || Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

    /* make sure the operands are on the GPU */
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

    /* a transposed product with a symmetric operand is a plain AB product */
    ptype = product->type;
    if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
    if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
    if (ptype != MATPRODUCT_AB && ptype != MATPRODUCT_AtB && ptype != MATPRODUCT_ABt) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);

    /* pick the mult structs: the transposed operand uses its matTranspose struct */
    Amat = (ptype == MATPRODUCT_AtB) ? Acusp->matTranspose : Acusp->mat;
    Bmat = (ptype == MATPRODUCT_ABt) ? Bcusp->matTranspose : Bcusp->mat;
    Cmat = Ccusp->mat;
    if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
    if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);

    Acsr = (CsrMatrix*)Amat->mat;
    Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
    if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    {
      /* B may be in compressed row storage, in which case the descriptor kept in the product data is used */
      cusparseSpMatDescr_t BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr;

      stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                         Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                         cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                         mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
      stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                    Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                    cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                    mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
      stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
    }
#else
    stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                               Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                               Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                               Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }

  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->was_assembled    = PETSC_TRUE;
  C->assembled        = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2357fcdce8c4SStefano Zampini 
2358fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2359fcdce8c4SStefano Zampini {
2360fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2361fcdce8c4SStefano Zampini   Mat                          A,B;
2362fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2363fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2364fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2365fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2366fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2367fcdce8c4SStefano Zampini   PetscBool                    flg;
2368fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2369fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2370fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2371fcdce8c4SStefano Zampini   MatProductType               ptype;
2372fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2373fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2374fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2375fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2376fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2377fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2378fcdce8c4SStefano Zampini #else
2379fcdce8c4SStefano Zampini   int                          cnz;
2380fcdce8c4SStefano Zampini #endif
2381b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2382fcdce8c4SStefano Zampini 
2383fcdce8c4SStefano Zampini   PetscFunctionBegin;
2384fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2385e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2386fcdce8c4SStefano Zampini   A    = product->A;
2387fcdce8c4SStefano Zampini   B    = product->B;
2388fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2389e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2390fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2391e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2392fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2393fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2394fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2395fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2396e8d2b73aSMark Adams   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2397e8d2b73aSMark Adams   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2398fcdce8c4SStefano Zampini 
2399fcdce8c4SStefano Zampini   /* product data */
2400fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2401fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2402fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2403fcdce8c4SStefano Zampini 
2404fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2405fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2406fcdce8c4SStefano Zampini   ptype = product->type;
2407fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2408fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2409fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2410fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2411fcdce8c4SStefano Zampini   switch (ptype) {
2412fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2413fcdce8c4SStefano Zampini     m = A->rmap->n;
2414fcdce8c4SStefano Zampini     n = B->cmap->n;
2415fcdce8c4SStefano Zampini     k = A->cmap->n;
2416fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2417fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2418fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2419fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2420fcdce8c4SStefano Zampini     break;
2421fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2422fcdce8c4SStefano Zampini     m = A->cmap->n;
2423fcdce8c4SStefano Zampini     n = B->cmap->n;
2424fcdce8c4SStefano Zampini     k = A->rmap->n;
24251a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2426fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2427fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2428fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2429fcdce8c4SStefano Zampini     break;
2430fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2431fcdce8c4SStefano Zampini     m = A->rmap->n;
2432fcdce8c4SStefano Zampini     n = B->rmap->n;
2433fcdce8c4SStefano Zampini     k = A->cmap->n;
24341a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
2435fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2436fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2437fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2438fcdce8c4SStefano Zampini     break;
2439fcdce8c4SStefano Zampini   default:
2440e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2441fcdce8c4SStefano Zampini   }
2442fcdce8c4SStefano Zampini 
2443fcdce8c4SStefano Zampini   /* create cusparse matrix */
2444fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2445fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2446fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2447fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2448fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2449fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2450fcdce8c4SStefano Zampini 
2451fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2452fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2453fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2454fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2455fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2456fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2457fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2458fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2459fcdce8c4SStefano Zampini   } else {
2460fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2461fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2462fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2463fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2464fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2465fcdce8c4SStefano Zampini   }
2466fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2467fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2468fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2469fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2470fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2471fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2472fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2473fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2474fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2475fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2476fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2477fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2478fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2479fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2480fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2481fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2482fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2483fcdce8c4SStefano Zampini     c->nz = 0;
2484fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2485fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2486fcdce8c4SStefano Zampini     goto finalizesym;
2487fcdce8c4SStefano Zampini   }
2488fcdce8c4SStefano Zampini 
2489e8d2b73aSMark Adams   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2490e8d2b73aSMark Adams   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2491fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2492fcdce8c4SStefano Zampini   if (!biscompressed) {
2493fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2494fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2495fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2496fcdce8c4SStefano Zampini #endif
2497fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2498fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2499fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2500fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2501fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2502fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2503fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2504fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2505fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2506fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2507fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2508fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2509fcdce8c4SStefano Zampini     }
2510fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2511fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2512fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2513fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2514fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2515fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2516fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2517fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2518fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2519fcdce8c4SStefano Zampini     }
2520fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2521fcdce8c4SStefano Zampini #endif
2522fcdce8c4SStefano Zampini   }
2523e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2524e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2525fcdce8c4SStefano Zampini   /* precompute flops count */
2526fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2527fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2528fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2529fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2530fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2531fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2532fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2533fcdce8c4SStefano Zampini       }
2534fcdce8c4SStefano Zampini     }
2535fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2536fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2537fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2538fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2539fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2540fcdce8c4SStefano Zampini     }
2541fcdce8c4SStefano Zampini   } else { /* TODO */
2542fcdce8c4SStefano Zampini     flops = 0.;
2543fcdce8c4SStefano Zampini   }
2544fcdce8c4SStefano Zampini 
2545fcdce8c4SStefano Zampini   mmdata->flops = flops;
2546fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2547b4285af6SJunchao Zhang 
2548fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2549fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2550fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2551fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
2552fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2553fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2554fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2555b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2556b4285af6SJunchao Zhang  {
2557b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2558b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2559b4285af6SJunchao Zhang   */
2560b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
2561b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
2562b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
2563b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2564b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
2565b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
2566b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
2567b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
2568b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
2569b4285af6SJunchao Zhang 
2570b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2571b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
2572b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2573b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2574b4285af6SJunchao Zhang                                             &bufferSize1, NULL);CHKERRCUSPARSE(stat);
2575b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
2576b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
2577b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2578b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2579b4285af6SJunchao Zhang                                             &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);
2580b4285af6SJunchao Zhang 
2581b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2582b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2583b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2584b4285af6SJunchao Zhang                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
2585b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
2586b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
2587b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
2588b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2589b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2590b4285af6SJunchao Zhang                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
2591b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
2592b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);
2593b4285af6SJunchao Zhang 
2594b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2595b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
2596b4285af6SJunchao Zhang   stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2597b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
2598b4285af6SJunchao Zhang   /* allocate matrix C */
2599b4285af6SJunchao Zhang   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2600b4285af6SJunchao Zhang   Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2601b4285af6SJunchao Zhang   /* update matC with the new pointers */
2602b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2603b4285af6SJunchao Zhang                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2604b4285af6SJunchao Zhang 
2605b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2606b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2607b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2608b4285af6SJunchao Zhang                                   &bufferSize5, NULL);CHKERRCUSPARSE(stat);
2609b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
2610b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2611b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2612b4285af6SJunchao Zhang                                   &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
2613b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
2614b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2615b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2616b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2617b4285af6SJunchao Zhang                                      mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2618b4285af6SJunchao Zhang   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
2619b4285af6SJunchao Zhang  }
2620b4285af6SJunchao Zhang  #else // ~PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2621b4285af6SJunchao Zhang   size_t bufSize2;
2622fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2623b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2624fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2625fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2626fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2627bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2628fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2629b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2630fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2631fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2632fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2633fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2634b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2635fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2636fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2637fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2638fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2639fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2640fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2641fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2642fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2643bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2644fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2645b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2646fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2647fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2648fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2649fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2650fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2651fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
265200702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2653fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2654fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2655fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2656fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2657fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2658fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2659b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2660fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2661fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2662b4285af6SJunchao Zhang  #endif
2663fcdce8c4SStefano Zampini #else
2664fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2665b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2666fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2667fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2668fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2669fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2670fcdce8c4SStefano Zampini   c->nz = cnz;
2671fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2672fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2673fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2674fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2675fcdce8c4SStefano Zampini 
2676fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2677fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2678fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2679fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2680b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2681fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2682fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2683fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2684fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2685fcdce8c4SStefano Zampini #endif
2686fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2687fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2688fcdce8c4SStefano Zampini finalizesym:
2689fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2690fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2691fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2692fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2693fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2694fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2695fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2696fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2697fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2698fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2699fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2700fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2701fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2702fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2703fcdce8c4SStefano Zampini   } else {
2704fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2705fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2706fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2707fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2708fcdce8c4SStefano Zampini   }
2709fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2710fcdce8c4SStefano Zampini     PetscInt r = 0;
2711fcdce8c4SStefano Zampini     c->i[0] = 0;
2712fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2713fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2714fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2715fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2716fcdce8c4SStefano Zampini     }
2717fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2718fcdce8c4SStefano Zampini   }
2719fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2720fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2721fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2722fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2723fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2724fcdce8c4SStefano Zampini   c->rmax = 0;
2725fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2726fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2727fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2728fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2729fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2730fcdce8c4SStefano Zampini   }
2731fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2732fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2733fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2734fcdce8c4SStefano Zampini 
2735fcdce8c4SStefano Zampini   C->nonzerostate++;
2736fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2737fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2738fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2739fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2740fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2741fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2742fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2743abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2744fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2745fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2746fcdce8c4SStefano Zampini   }
2747fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2748fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2749fcdce8c4SStefano Zampini }
2750fcdce8c4SStefano Zampini 
2751fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2752fcdce8c4SStefano Zampini 
2753fcdce8c4SStefano Zampini /* handles sparse or dense B */
2754fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2755fcdce8c4SStefano Zampini {
2756fcdce8c4SStefano Zampini   Mat_Product    *product = mat->product;
2757fcdce8c4SStefano Zampini   PetscErrorCode ierr;
2758fcdce8c4SStefano Zampini   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2759fcdce8c4SStefano Zampini 
2760fcdce8c4SStefano Zampini   PetscFunctionBegin;
2761fcdce8c4SStefano Zampini   MatCheckProduct(mat,1);
2762fcdce8c4SStefano Zampini   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2763abb89eb1SStefano Zampini   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2764fcdce8c4SStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2765fcdce8c4SStefano Zampini   }
2766fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
2767fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
2768fcdce8c4SStefano Zampini     if (!product->C->boundtocpu) {
2769fcdce8c4SStefano Zampini       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2770fcdce8c4SStefano Zampini     }
2771fcdce8c4SStefano Zampini   }
277265e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
277365e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
277465e4b4d4SStefano Zampini     switch (product->type) {
277565e4b4d4SStefano Zampini     case MATPRODUCT_AB:
277665e4b4d4SStefano Zampini       if (product->api_user) {
277765e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
277865e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
277965e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
278065e4b4d4SStefano Zampini       } else {
278165e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
278265e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
278365e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
278465e4b4d4SStefano Zampini       }
278565e4b4d4SStefano Zampini       break;
278665e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
278765e4b4d4SStefano Zampini       if (product->api_user) {
278865e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
278965e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
279065e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
279165e4b4d4SStefano Zampini       } else {
279265e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
279365e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
279465e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
279565e4b4d4SStefano Zampini       }
279665e4b4d4SStefano Zampini       break;
279765e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
279865e4b4d4SStefano Zampini       if (product->api_user) {
279965e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
280065e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
280165e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
280265e4b4d4SStefano Zampini       } else {
280365e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
280465e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
280565e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
280665e4b4d4SStefano Zampini       }
280765e4b4d4SStefano Zampini       break;
280865e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
280965e4b4d4SStefano Zampini       if (product->api_user) {
281065e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
281165e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
281265e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
281365e4b4d4SStefano Zampini       } else {
281465e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
281565e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
281665e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
281765e4b4d4SStefano Zampini       }
281865e4b4d4SStefano Zampini       break;
281965e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
282065e4b4d4SStefano Zampini       if (product->api_user) {
282165e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
282265e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
282365e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
282465e4b4d4SStefano Zampini       } else {
282565e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
282665e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
282765e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
282865e4b4d4SStefano Zampini       }
282965e4b4d4SStefano Zampini       break;
283065e4b4d4SStefano Zampini     default:
283165e4b4d4SStefano Zampini       break;
283265e4b4d4SStefano Zampini     }
283365e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
283465e4b4d4SStefano Zampini   }
283565e4b4d4SStefano Zampini   /* dispatch */
2836fcdce8c4SStefano Zampini   if (isdense) {
2837ccdfe979SStefano Zampini     switch (product->type) {
2838ccdfe979SStefano Zampini     case MATPRODUCT_AB:
2839ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
2840ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
2841ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
2842ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
2843fcdce8c4SStefano Zampini      if (product->A->boundtocpu) {
2844fcdce8c4SStefano Zampini         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2845fcdce8c4SStefano Zampini       } else {
2846fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2847fcdce8c4SStefano Zampini       }
2848fcdce8c4SStefano Zampini       break;
2849fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2850fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2851fcdce8c4SStefano Zampini       break;
2852ccdfe979SStefano Zampini     default:
2853ccdfe979SStefano Zampini       break;
2854ccdfe979SStefano Zampini     }
2855fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
2856fcdce8c4SStefano Zampini     switch (product->type) {
2857fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
2858fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
2859fcdce8c4SStefano Zampini     case MATPRODUCT_ABt:
2860fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2861fcdce8c4SStefano Zampini       break;
2862fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
2863fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
2864fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2865fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2866fcdce8c4SStefano Zampini       break;
2867fcdce8c4SStefano Zampini     default:
2868fcdce8c4SStefano Zampini       break;
2869fcdce8c4SStefano Zampini     }
2870fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
2871fcdce8c4SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2872fcdce8c4SStefano Zampini   }
2873ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2874ccdfe979SStefano Zampini }
2875ccdfe979SStefano Zampini 
/* yy = A*xx. Forwards to the shared mult kernel with no vector to add (NULL) and neither transpose nor Hermitian requested */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_FALSE,hermitian = PETSC_FALSE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2884e6e9a74fSStefano Zampini 
/* zz = A*xx + yy. Forwards to the shared mult kernel with neither transpose nor Hermitian requested */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool transpose = PETSC_FALSE,hermitian = PETSC_FALSE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2893e6e9a74fSStefano Zampini 
/* yy = A^H*xx. Forwards to the shared mult kernel with no vector to add (NULL) and both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_TRUE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2902e6e9a74fSStefano Zampini 
/* zz = A^H*xx + yy. Forwards to the shared mult kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_TRUE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian);CHKERRQ(err);
  PetscFunctionReturn(0);
}
29119ae82921SPaul Mullowney 
/* yy = A^T*xx. Forwards to the shared mult kernel with no vector to add (NULL), transpose set and Hermitian cleared */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_FALSE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2920ca45077fSPaul Mullowney 
/*
   ScatterAdd - CUDA kernel performing y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.

   Launch layout: 1D grid of 1D blocks providing at least n threads in total; surplus threads do nothing.
   No shared memory is used.

   The update is a plain (non-atomic) read-modify-write, so this presumably relies on the entries of idx
   being pairwise distinct -- TODO confirm against the callers.

   The global thread index is formed in 64-bit arithmetic: n is a PetscInt (which may be 64-bit), and a 32-bit
   product blockIdx.x*blockDim.x stored in an int can wrap or turn negative for very large n, which would
   slip past the i < n guard and index out of bounds.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const long long i = (long long)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < (long long)n) y[idx[i]] += x[i];
}
2926a0e72f99SJunchao Zhang 
/*
   MatMultAddKernel_SeqAIJCUSPARSE - Common driver behind the MatMult*() variants of this matrix type.

   Computes z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Input Parameters:
+  A     - the matrix
.  xx    - the vector x
.  yy    - the vector y to add, or NULL to compute z = op(A) x only; may be the same Vec as zz
.  trans - apply the (conjugate) transpose of A instead of A
-  herm  - together with trans, request the conjugate transpose; herm without trans raises an error

   Output Parameter:
.  zz - the result z

   Notes:
   When the matrix has no nonzero rows the product is skipped altogether and zz is set to 0 (yy == NULL) or to a copy of yy.

   For a plain transpose with A->form_explicit_transpose set, the explicitly stored transpose is multiplied with the
   non-transpose operation; otherwise the stored matrix is used with the corresponding cuSPARSE transpose operation.

   When the selected mult struct has cprowIndices (compressed rows), cusparsestruct->workVector holds the short vector:
   the product for the non-transpose operation (scatter-added into zz afterwards), or the gathered entries of x for the
   transpose operations.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* No nonzero rows: op(A) x contributes nothing, so z is either 0 or a copy of y */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* Use the stored matrix and let cuSPARSE apply the (conjugate) transpose */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      /* matstruct is dereferenced right below, so guard this path the same way as the non-transpose one */
      if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
    } else {
      /* Use the explicitly stored transpose with the non-transpose operation, building it on first use */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
      if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'matTranspose' (need to fix)");
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the cuSpMV[] array below, so make sure its integer value is in the expected range */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt    n = matstruct->cprowIndices->size();
        cudaError_t cerr;

        /* A grid with zero blocks is an invalid launch configuration, so only launch when there is something to scatter */
        if (n) {
          ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
          /* A kernel launch returns no status by itself; launch failures are only visible through cudaGetLastError() */
          cerr = cudaGetLastError();CHKERRCUDA(cerr);
        }
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* Flop accounting differs between the add (yy given) and the plain multiply variants */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
31189ae82921SPaul Mullowney 
/* zz = A^T*xx + yy. Forwards to the shared mult kernel with transpose set and Hermitian cleared */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_FALSE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3127ca45077fSPaul Mullowney 
/*
   Completes assembly through the SeqAIJ implementation, then frees the device mat (deviceMat)
   if the matrix's nonzerostate changed during this assembly.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE     *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscObjectState stateBefore = A->nonzerostate; /* snapshot taken before the SeqAIJ assembly runs */
  PetscErrorCode         err;
  cudaError_t            cuerr;

  PetscFunctionBegin;
  err = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(err);
  /* Nothing to release when the nonzerostate is unchanged or no device mat exists */
  if (stateBefore == A->nonzerostate || !gpu->deviceMat) {
    PetscFunctionReturn(0);
  }
  err = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(err);
  cuerr = cudaFree(gpu->deviceMat);CHKERRCUDA(cuerr);
  gpu->deviceMat = NULL; /* so a later pass does not free it again */
  PetscFunctionReturn(0);
}
31459ae82921SPaul Mullowney 
31469ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* create the empty matrix; the same m and n are passed for both size pairs of MatSetSizes() */
  err = MatCreate(comm,A);CHKERRQ(err);
  err = MatSetSizes(*A,m,n,m,n);CHKERRQ(err);
  /* select the CUSPARSE type, then preallocate through the SeqAIJ routine */
  err = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(err);
  err = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(err);
  PetscFunctionReturn(0);
}
32059ae82921SPaul Mullowney 
/*
   Destroys the CUSPARSE-specific data hanging off A->spptr (the triangular-factor data for a factored matrix,
   the regular mult data otherwise), removes the functions composed on the object, then destroys the SeqAIJ part.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions to remove, in removal order */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C"
  };
  PetscErrorCode err;
  size_t         k;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    err = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(err);
  } else {
    err = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(err);
  }
  for (k = 0; k < sizeof(composed)/sizeof(composed[0]); k++) {
    err = PetscObjectComposeFunction((PetscObject)A,composed[k],NULL);CHKERRQ(err);
  }
  err = MatDestroy_SeqAIJ(A);CHKERRQ(err);
  PetscFunctionReturn(0);
}
32279ae82921SPaul Mullowney 
3228ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
322995639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicates A with the SeqAIJ routine, then converts the new matrix *B in place to MATSEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* step 1: SeqAIJ duplicate, honoring cpvalues */
  err = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(err);
  /* step 2: in-place conversion of the duplicate */
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
32399ff858a8SKarl Rupp 
3240039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
324195639643SRichard Tran Mills {
3242e6e9a74fSStefano Zampini   PetscErrorCode     ierr;
3243a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3244039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3245039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3246039c6fbaSStefano Zampini   PetscScalar        *ay;
3247039c6fbaSStefano Zampini   const PetscScalar  *ax;
3248039c6fbaSStefano Zampini   CsrMatrix          *csry,*csrx;
3249e6e9a74fSStefano Zampini 
325095639643SRichard Tran Mills   PetscFunctionBegin;
3251a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3252a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3253039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
3254a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3255a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3256a587d139SMark     PetscFunctionReturn(0);
325795639643SRichard Tran Mills   }
3258039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
3259a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
3260a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
3261e8d2b73aSMark Adams   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3262e8d2b73aSMark Adams   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3263039c6fbaSStefano Zampini   csry = (CsrMatrix*)cy->mat->mat;
3264039c6fbaSStefano Zampini   csrx = (CsrMatrix*)cx->mat->mat;
3265039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3266039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3267039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3268039c6fbaSStefano Zampini     if (eq) {
3269039c6fbaSStefano Zampini       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3270039c6fbaSStefano Zampini     }
3271039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3272039c6fbaSStefano Zampini   }
3273d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3274d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3275039c6fbaSStefano Zampini 
3276039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3277039c6fbaSStefano Zampini     cusparseStatus_t stat;
3278039c6fbaSStefano Zampini     PetscScalar      b = 1.0;
3279039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3280039c6fbaSStefano Zampini     size_t           bufferSize;
3281039c6fbaSStefano Zampini     void             *buffer;
3282ee7b52eaSHong Zhang     cudaError_t      cerr;
3283039c6fbaSStefano Zampini #endif
3284039c6fbaSStefano Zampini 
3285039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3286039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3287039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3288039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3289039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3290039c6fbaSStefano Zampini                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3291039c6fbaSStefano Zampini                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3292039c6fbaSStefano Zampini                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3293039c6fbaSStefano Zampini     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3294039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3295039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3296039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3297039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3298039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3299039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3300039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3301039c6fbaSStefano Zampini     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3302039c6fbaSStefano Zampini #else
3303039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3304039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3305039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3306039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3307039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3308039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3309039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3310039c6fbaSStefano Zampini #endif
3311039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3312039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3313039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3314039c6fbaSStefano Zampini     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3315039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3316a587d139SMark     cublasHandle_t cublasv2handle;
3317039c6fbaSStefano Zampini     cublasStatus_t berr;
3318a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3319039c6fbaSStefano Zampini 
3320039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3321039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3322a587d139SMark     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3323a587d139SMark     ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
3324a587d139SMark     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3325039c6fbaSStefano Zampini     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3326a587d139SMark     ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
3327a587d139SMark     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3328039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3329039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3330a587d139SMark     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3331039c6fbaSStefano Zampini   } else {
3332a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3333d2be01edSStefano Zampini     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3334a587d139SMark   }
333595639643SRichard Tran Mills   PetscFunctionReturn(0);
333695639643SRichard Tran Mills }
333795639643SRichard Tran Mills 
/*
  MatScale_SeqAIJCUSPARSE - In-place scaling Y <- a*Y for a SEQAIJCUSPARSE matrix.

  Input Parameters:
+ Y - the matrix to scale
- a - the scaling factor

  Notes:
  The first y->nz entries of the array handed out by MatSeqAIJCUSPARSEGetArray() are scaled
  with a single cublasXscal() call and one flop per entry is logged.
  MatSeqAIJInvalidateDiagonal() is called once the array has been restored.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  PetscBLASInt   nvals = 1,stride = 1;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr  = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr  = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  /* cuBLAS takes the vector length as a BLAS integer */
  ierr  = PetscBLASIntCast(aij->nz,&nvals);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,nvals,&a,vals,stride);CHKERRCUBLAS(cberr);
  ierr  = PetscLogGpuFlops(nvals);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
335933c9ba73SStefano Zampini 
/*
  MatZeroEntries_SeqAIJCUSPARSE - Sets all stored values of A to zero.

  Input Parameter:
. A - the matrix

  Notes:
  For unfactored matrices the values of the GPU CSR matrix and, when present, of the GPU
  transpose are filled with zeros through thrust. The host values a->a are always zeroed.
  The offload mask becomes PETSC_OFFLOAD_BOTH when the GPU values of the (non-transposed)
  matrix were zeroed, and PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuZeroed = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
        gpuZeroed = PETSC_TRUE;
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the host values directly instead of going through MatZeroEntries_SeqAIJ() */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = gpuZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
33903fa6b06aSMark Adams 
/*
  MatBindToCPU_SeqAIJCUSPARSE - Switches the operations of A between the SeqAIJ (host)
  implementations and the SeqAIJCUSPARSE implementations.

  Input Parameters:
+ A   - the matrix
- flg - PETSC_TRUE to install the SeqAIJ operations, PETSC_FALSE to install the CUSPARSE ones

  Notes:
  Factored matrices are left untouched. When binding to the CPU, MatSeqAIJCUSPARSECopyFromGPU()
  is called before the function table is changed. Besides the entries of A->ops, the composed
  functions queried by name are replaced (or removed by composing NULL). On exit A->boundtocpu
  and the inode flag of the SeqAIJ data both equal flg.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  PetscObject    obj  = (PetscObject)A;
  struct _MatOps *ops = A->ops;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (!flg) {
    /* CUSPARSE implementations */
    ops->scale                     = MatScale_SeqAIJCUSPARSE;
    ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    ops->mult                      = MatMult_SeqAIJCUSPARSE;
    ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction(obj,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* bring the data back before the host implementations take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    /* SeqAIJ implementations; the Hermitian transpose products have no host counterpart here */
    ops->scale                     = MatScale_SeqAIJ;
    ops->axpy                      = MatAXPY_SeqAIJ;
    ops->zeroentries               = MatZeroEntries_SeqAIJ;
    ops->mult                      = MatMult_SeqAIJ;
    ops->multadd                   = MatMultAdd_SeqAIJ;
    ops->multtranspose             = MatMultTranspose_SeqAIJ;
    ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    ops->multhermitiantranspose    = NULL;
    ops->multhermitiantransposeadd = NULL;
    ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction(obj,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }
  A->boundtocpu  = flg;
  aij->inode.use = flg;
  PetscFunctionReturn(0);
}
3441a587d139SMark 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a SeqAIJCUSPARSE matrix.

  Input Parameters:
+ A     - the matrix to convert
. mtype - requested type (not read by this routine)
- reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies the values of A
          into the existing *newmat, any other value works on *newmat as passed in

  Output Parameter:
. newmat - the converted matrix

  Notes:
  The default vector type of the result is set to VECCUDA. Unless reuse is MAT_REUSE_MATRIX or
  the matrix already owns a GPU structure, a new one is allocated: a Mat_SeqAIJCUSPARSE for
  unfactored matrices, a Mat_SeqAIJCUSPARSETriFactors otherwise. Either gets its own cuSPARSE
  handle attached to PetscDefaultCudaStream.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t cstat;
  Mat              C;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* nothing to copy: *newmat is used as given */
    break;
  }
  C = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !C->spptr) {
    if (C->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      ierr  = PetscNew(&factors);CHKERRQ(ierr);
      cstat = cusparseCreate(&factors->handle);CHKERRCUSPARSE(cstat);
      cstat = cusparseSetStream(factors->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(cstat);
      C->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      ierr  = PetscNew(&cusp);CHKERRQ(ierr);
      cstat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(cstat);
      cstat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(cstat);
      cusp->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      C->spptr = cusp;
    }
    C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that do not depend on the CPU/GPU binding */
  C->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  C->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  C->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  C->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the remaining, binding-dependent operations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(C,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)C,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34959ae82921SPaul Mullowney 
/*
  MatCreate_SeqAIJCUSPARSE - Constructor for the MATSEQAIJCUSPARSE type.

  Input Parameter:
. B - the matrix being created

  Notes:
  B is first set up as a SeqAIJ matrix and then converted in place with
  MatConvert_SeqAIJ_SeqAIJCUSPARSE().
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host side first ... */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* ... then the in-place conversion of the very same object */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
350502fe1965SBarry Smith 
35063ca39a21SBarry Smith /*MC
3507e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3508e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
35112692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3512e057df02SPaul Mullowney 
3513e057df02SPaul Mullowney    Options Database Keys:
3514e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3515aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3516a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3517e057df02SPaul Mullowney 
3518e057df02SPaul Mullowney   Level: beginner
3519e057df02SPaul Mullowney 
35208468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3521e057df02SPaul Mullowney M*/
35227f756511SDominic Meiser 
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/*
  MatSolverTypeRegister_CUSPARSE - Registers the factorization routines provided by this package.

  Notes:
  MATSOLVERCUSPARSEBAND is registered for LU on MATSEQAIJ. MATSOLVERCUSPARSE is registered for
  LU, Cholesky, ILU and ICC on MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              k;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  /* one getter serves every factorization type of the regular solver */
  for (k=0; k<sizeof(ftypes)/sizeof(ftypes[0]); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
353829b38603SBarry Smith 
/*
  MatSeqAIJCUSPARSE_Destroy - Releases a Mat_SeqAIJCUSPARSE structure and everything it owns.

  Input Parameter:
. cusparsestruct - address of the pointer to the structure; nothing happens if it points to NULL

  Notes:
  Destroys the mult structures of the matrix and of its transpose, deletes the auxiliary
  arrays, destroys the cuSPARSE handle if one was created and finally frees the structure
  with PetscFree().
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp;
  cusparseStatus_t   cstat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!*cusparsestruct) PetscFunctionReturn(0);
  cusp = *cusparsestruct;
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;
  if (cusp->handle) {
    cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
  }
  /* free through the caller's pointer, as in the original call */
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
35587f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - Deletes a CsrMatrix together with its three arrays.

  Input Parameter:
. mat - address of the pointer to the CsrMatrix; set to NULL on exit, nothing happens if it
        already points to NULL
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr;

  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  csr = *mat;
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
35717f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload releasing a triangular factor structure.

  Input Parameter:
. trifactor - address of the pointer to the structure; nothing happens if it points to NULL

  Notes:
  Destroys the matrix descriptor and the solve analysis info when present, the CSR matrix,
  the solve buffer, the host buffer AA_h (released with cudaFreeHost()) and, with CUDA 11 or
  later, the csr2csc buffer. The structure itself is freed with PetscFree().
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf;
  cusparseStatus_t                  cstat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!*trifactor) PetscFunctionReturn(0);
  tf = *trifactor;
  if (tf->descr) {
    cstat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(cstat);
  }
  if (tf->solveInfo) {
    cstat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(cstat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  /* free through the caller's pointer, as in the original call */
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
35917f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload releasing a mult structure (the GPU matrix
  used for matrix-vector products).

  Input Parameters:
+ matstruct - address of the pointer to the structure; set to NULL on exit, nothing happens if
              it already points to NULL
- format    - storage format of (*matstruct)->mat; selects how that member is destroyed

  Notes:
  With CUDA 11 or later an error (PETSC_ERR_SUP) is raised when a matrix stored in
  MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB format is encountered.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate a failure instead of silently dropping the return code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* scalar constants released with cudaFree() */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* release the buffer and vector descriptors of every initialized cuSpMV slot */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
36357f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - Releases the data held by a triangular factors structure
  while keeping the structure itself (and its cuSPARSE handle) alive.

  Input Parameter:
. trifactors - address of the pointer to the structure; nothing happens if it points to NULL

  Notes:
  The four factor structures are destroyed, the permutation and work arrays are deleted and
  the band arrays a_band_d and i_band_d are released with cudaFree(). Every released member is
  reset to NULL so that the routine can safely be called again, e.g. from
  MatSeqAIJCUSPARSETriFactors_Destroy(), without freeing memory twice.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {
      cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);
      (*trifactors)->a_band_d = NULL; /* avoid a double free on the next reset */
    }
    if ((*trifactors)->i_band_d) {
      cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);
      (*trifactors)->i_band_d = NULL; /* avoid a double free on the next reset */
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3658ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - Releases a triangular factors structure completely.

  Input Parameter:
. trifactors - address of the pointer to the structure; nothing happens if it points to NULL

  Notes:
  The contents are released with MatSeqAIJCUSPARSETriFactors_Reset(), then the cuSPARSE handle
  is destroyed if there is one and the structure is freed with PetscFree().
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t cstat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  handle = (*trifactors)->handle;
  if (handle) {
    cstat = cusparseDestroy(handle);CHKERRCUSPARSE(cstat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
36757e8381f9SStefano Zampini 
/* Strict "less than" on pairs of integers: lexicographic order, first component first */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt first1 = t1.get<0>(), first2 = t2.get<0>();

    /* the second component only matters when the first ones tie */
    if (first1 != first2) return first1 < first2;
    return t1.get<1>() < t2.get<1>();
  }
};
36867e8381f9SStefano Zampini 
/* Equality of pairs of integers: true only when both components agree */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
36967e8381f9SStefano Zampini 
/* Indicator of a change between two integers: 0 when they are equal, 1 otherwise */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 == t2) return 0;
    return 1;
  }
};
37057e8381f9SStefano Zampini 
/* Logical "or" of two integers returned as an integer: 1 when either is nonzero, 0 otherwise */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
37147e8381f9SStefano Zampini 
37157e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - Sets the values of a matrix whose COO pattern was
  previously registered with MatSetPreallocationCOO_SeqAIJCUSPARSE().

  Input Parameters:
+ A     - the matrix
. v     - values, one per COO entry, in host or device memory (detected with isCudaMem());
          may be NULL, in which case INSERT_VALUES zeros the stored values and
          ADD_VALUES leaves them untouched
- imode - INSERT_VALUES to overwrite the stored values, anything else adds to them

  Notes:
  cusp->cooPerm maps the sorted COO ordering back to positions in v[]. When
  cusp->cooPerm_a is present the COO pattern has repeated (i,j) pairs, and the
  repeated values are summed with a reduce_by_key before being stored.

  Temporary device buffers are scoped objects so that they are released on every
  exit path, including early returns triggered by CHKERRQ().

  NOTE(review): the values are modified on the GPU but A->transupdated is not reset
  here, whereas MatSeqAIJCUSPARSEGetArray() calls MatSeqAIJCUSPARSEInvalidateTranspose();
  confirm whether a cached explicit transpose can become stale after this call.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO permutation stored: just assemble */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
  } else {
    const PetscInt                        n = cusp->cooPerm->size();
    THRUSTARRAY                           cooPerm_v; /* device staging copy of v[], only filled when v[] lives on the host */
    thrust::device_ptr<const PetscScalar> d_v;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      cooPerm_v.assign(v,v+n);
      d_v = cooPerm_v.data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
      if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them */
        THRUSTARRAY cooPerm_w(matrix->values->size());
        auto        vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
          cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
          cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
        */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(cooPerm_w.begin(),cooPerm_w.end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      } else {
        /* all nonzeros in d_v[] are unique entries */
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
      }
    } else {
      if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
        auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      } else {
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }
  /* the up-to-date values now live on the GPU */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
37977e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - Flags the cached transpose of A as out of date.

  Input Parameters:
+ A       - the matrix, must be of type MATSEQAIJCUSPARSE
- destroy - if true, also destroy the transpose mult struct and the csr2csc_i index array

  Notes:
  Does nothing (beyond the type check) when A->spptr is NULL.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (spptr) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose,spptr->format);CHKERRQ(ierr);
      delete spptr->csr2csc_i;
      spptr->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3814a49f1ed0SStefano Zampini 
38157e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - Builds the CSR nonzero pattern of A from a list of
  COO (i,j) index pairs and records the permutation needed later by
  MatSetValuesCOO_SeqAIJCUSPARSE().

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices of the entries (length n)
- coo_j - column indices of the entries (length n)

  Notes:
  The pairs are sorted on the device by row and then by column, and duplicates are
  collapsed. cusp->cooPerm stores the sorting permutation. cusp->cooPerm_a is kept only
  when duplicates exist and maps every sorted entry to the index of its unique nonzero.
  The host CSR arrays a->i, a->j and a->a are rebuilt (values zeroed), the pattern is
  pushed to the GPU, and the matrix is left unassembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* cusp is dereferenced unconditionally below: fail cleanly instead of crashing */
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* size changed: drop the old permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
      d_j     = [2,2,3,5,6,x]
                           ^nekey (new logical end); ekey still points one past x
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count the rows with at least one nonzero */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3933ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetIJ - Returns read-only device pointers to the CSR index arrays of A.

  Input Parameters:
+ A          - the matrix, must be of type MATSEQAIJCUSPARSE with a format other than ELL or HYB
- compressed - if false and A uses compressed rows, *i is the full row offset array
               (length A->rmap->n + 1, created on first use); otherwise *i is the row
               offset array stored in the CsrMatrix

  Output Parameters:
+ i - row offsets on the device, may be NULL if not needed
- j - column indices on the device, may be NULL if not needed

  Notes:
  The matrix is copied to the GPU first. Each output is filled independently, so a
  caller may request only one of them; the call returns immediately only when both
  i and j are NULL.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i && !j) PetscFunctionReturn(0); /* nothing requested */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
3962*5f101d05SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreIJ - Counterpart of MatSeqAIJCUSPARSEGetIJ(): resets the
  pointers passed in (those that are non-NULL) to NULL.

  The compressed argument is not used by this routine.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) {
    *i = NULL;
  }
  if (j) {
    *j = NULL;
  }
  PetscFunctionReturn(0);
}
3972*5f101d05SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayRead - Returns a read-only device pointer to the CSR values of A.

  Input Parameter:
. A - the matrix, must be of type MATSEQAIJCUSPARSE with a format other than ELL or HYB

  Output Parameter:
. a - pointer to the values stored on the device

  Notes:
  The matrix is copied to the GPU before the pointer is handed out.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  switch (spptr->format) {
  case MAT_CUSPARSE_ELL:
  case MAT_CUSPARSE_HYB:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  default:
    break;
  }
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!spptr->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)spptr->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
3991ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - Counterpart of MatSeqAIJCUSPARSEGetArrayRead():
  validates its arguments and resets the caller's pointer to NULL.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  a[0] = NULL; /* the pointer must not be used after restore */
  PetscFunctionReturn(0);
}
4001ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArray - Returns a read-write device pointer to the CSR values of A.

  Input Parameter:
. A - the matrix, must be of type MATSEQAIJCUSPARSE with a format other than ELL or HYB

  Output Parameter:
. a - pointer to the values stored on the device

  Notes:
  The matrix is copied to the GPU first. Because the caller may modify the values,
  the offload mask is set to PETSC_OFFLOAD_GPU and the cached transpose is flagged
  as out of date (without destroying it).
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  switch (spptr->format) {
  case MAT_CUSPARSE_ELL:
  case MAT_CUSPARSE_HYB:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  default:
    break;
  }
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!spptr->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)spptr->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the GPU copy becomes the valid one */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4022039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArray - Counterpart of MatSeqAIJCUSPARSEGetArray(): increases
  the object state of A and resets the caller's pointer to NULL.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* bump the state first; the pointer is only cleared if that succeeds */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4035039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - Returns a write-only device pointer to the CSR values of A.

  Input Parameter:
. A - the matrix, must be of type MATSEQAIJCUSPARSE with a format other than ELL or HYB

  Output Parameter:
. a - pointer to the values stored on the device

  Notes:
  Unlike MatSeqAIJCUSPARSEGetArray(), the matrix is not copied to the GPU beforehand,
  so the GPU structure must already exist. The offload mask is set to PETSC_OFFLOAD_GPU
  and the cached transpose is flagged as out of date (without destroying it).
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  switch (spptr->format) {
  case MAT_CUSPARSE_ELL:
  case MAT_CUSPARSE_HYB:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  default:
    break;
  }
  if (!spptr->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)spptr->mat->mat;
  if (!csrmat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the GPU copy becomes the valid one */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4055ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - Counterpart of MatSeqAIJCUSPARSEGetArrayWrite():
  increases the object state of A and resets the caller's pointer to NULL.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* bump the state first; the pointer is only cleared if that succeeds */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4068ed502f03SStefano Zampini 
/* Strict ordering on 4-tuples that compares only the first two components,
   lexicographically; the third and fourth components are ignored */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int first1 = t1.get<0>(), first2 = t2.get<0>();

    if (first1 != first2) return first1 < first2;
    return t1.get<1>() < t2.get<1>(); /* tie on the first component: decide on the second */
  }
};
4079ed502f03SStefano Zampini 
/* Unary functor that adds a fixed integer offset to its argument */
struct Shift
{
  int _shift; /* offset added to every input */

  Shift(int shift)
  {
    _shift = shift;
  }
  __host__ __device__
  inline int operator() (const int &c)
  {
    const int shifted = _shift + c;
    return shifted;
  }
};
4091ed502f03SStefano Zampini 
4092ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4093ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4094ed502f03SStefano Zampini {
4095ed502f03SStefano Zampini   PetscErrorCode               ierr;
4096ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4097ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4098ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4099ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4100ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4101ed502f03SStefano Zampini   cusparseStatus_t             stat;
4102ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4103ed502f03SStefano Zampini   cudaError_t                  cerr;
4104ed502f03SStefano Zampini 
4105ed502f03SStefano Zampini   PetscFunctionBegin;
4106ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4107ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4108ed502f03SStefano Zampini   PetscValidPointer(C,4);
4109ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4110ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
4111ed502f03SStefano Zampini   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
4112ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4113ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4114ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4115ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4116ed502f03SStefano Zampini     m     = A->rmap->n;
4117ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
4118ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
4119ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
4120ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4121ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4122ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4123ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4124ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4125ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4126ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4127ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4128ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4129ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4130ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4131ed502f03SStefano Zampini     Ccusp->nrows    = m;
4132ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4133ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4134ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4135ed502f03SStefano Zampini     Ccsr->num_cols  = n;
4136ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4137ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4138ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4139ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4140ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4141ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4142ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4143ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4144ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4145ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4146ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
41471a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
41481a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
4149ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4150ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4151ed502f03SStefano Zampini 
4152ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4153ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4154ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4155ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4156ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4157ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4158ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4159ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4160ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4161ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4162ed502f03SStefano Zampini     if (c->nz) {
41632ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
41642ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
41652ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
41662ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
41672ed87e7eSStefano Zampini 
4168ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4169ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4170ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4171ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4172ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4173ed502f03SStefano Zampini         }
41742ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
41752ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4176ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4177ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4178ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4179ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4180ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4181ed502f03SStefano Zampini         }
41822ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
41832ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
4184ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
41852ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
41862ed87e7eSStefano Zampini                               Aroff->data().get(),
41872ed87e7eSStefano Zampini                               Annz,
41882ed87e7eSStefano Zampini                               m,
41892ed87e7eSStefano Zampini                               Acoo->data().get(),
41902ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4191ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
41922ed87e7eSStefano Zampini                               Broff->data().get(),
4193ed502f03SStefano Zampini                               Bnnz,
4194ed502f03SStefano Zampini                               m,
41952ed87e7eSStefano Zampini                               Bcoo->data().get(),
4196ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
41972ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
41982ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
41992ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
42008909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4201ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4202ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
42038909a122SStefano Zampini #else
42048909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
42058909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
42068909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
42078909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
42088909a122SStefano Zampini #endif
42092ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
42102ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
42112ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
42122ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
42132ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
42142ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4215ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4216ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4217ed502f03SStefano Zampini       thrust::advance(p2,Annz);
42182ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
42198909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
42208909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
42218909a122SStefano Zampini #endif
42222ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
42232ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
42242ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
42252ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
42262ed87e7eSStefano Zampini #else
42272ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
42282ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
42292ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
42302ed87e7eSStefano Zampini #endif
4231ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
42322ed87e7eSStefano Zampini                               Ccoo->data().get(),
4233ed502f03SStefano Zampini                               c->nz,
4234ed502f03SStefano Zampini                               m,
4235ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4236ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4237ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
42382ed87e7eSStefano Zampini       delete wPerm;
42392ed87e7eSStefano Zampini       delete Acoo;
42402ed87e7eSStefano Zampini       delete Bcoo;
42412ed87e7eSStefano Zampini       delete Ccoo;
4242ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4243ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4244ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4245ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4246ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4247ed502f03SStefano Zampini #endif
42481a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4249ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4250ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4251ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4252ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4253ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4254ed502f03SStefano Zampini 
42551a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
42561a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4257a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4258ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4259ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4260ed502f03SStefano Zampini         CcsrT->num_rows = n;
4261ed502f03SStefano Zampini         CcsrT->num_cols = m;
4262ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4263ed502f03SStefano Zampini 
4264ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4265ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4266ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4267ed502f03SStefano Zampini 
4268ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4269ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4270ed502f03SStefano Zampini         if (AT) {
4271ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4272ed502f03SStefano Zampini           thrust::advance(rT,-1);
4273ed502f03SStefano Zampini         }
4274ed502f03SStefano Zampini         if (BT) {
4275ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4276ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4277ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4278ed502f03SStefano Zampini         }
4279ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4280ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4281ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4282ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4283ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4284ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4285ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4286ed502f03SStefano Zampini 
4287ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4288ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4289ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4290ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4291ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4292ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4293ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4294ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4295ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4296ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4297ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4298ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4299ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4300ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4301ed502f03SStefano Zampini #endif
4302ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4303ed502f03SStefano Zampini       }
4304ed502f03SStefano Zampini     }
4305ed502f03SStefano Zampini 
4306ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4307ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4308ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4309ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4310ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4311ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4312ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4313ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4314ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4315ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4316ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4317ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4318ed502f03SStefano Zampini     } else {
4319ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4320ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4321ed502f03SStefano Zampini     }
4322ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4323ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4324ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4325ed502f03SStefano Zampini     c->maxnz = c->nz;
4326ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4327ed502f03SStefano Zampini     c->rmax = 0;
4328ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4329ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4330ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4331ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4332ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4333ed502f03SStefano Zampini     }
4334ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4335ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4336ed502f03SStefano Zampini     (*C)->nonzerostate++;
4337ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4338ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4339ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4340ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4341ed502f03SStefano Zampini   } else {
4342ed502f03SStefano Zampini     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4343ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4344ed502f03SStefano Zampini     if (c->nz) {
4345ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4346ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4347ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4348ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4349ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4350ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4351ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4352ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4353ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4354ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4355ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4356ed502f03SStefano Zampini       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4357ed502f03SStefano Zampini       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4358ed502f03SStefano Zampini       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4359ed502f03SStefano Zampini       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4360ed502f03SStefano Zampini       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4361ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4362ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4363ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4364ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4365ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4366ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4367ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4368ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4369ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4370ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4371ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4372ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4373ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4374a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
43751a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4376ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4377ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4378ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4379ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4380ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4381ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4382ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4383ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
43841a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4385ed502f03SStefano Zampini       }
4386ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4387ed502f03SStefano Zampini     }
4388ed502f03SStefano Zampini   }
4389ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4390ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4391ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4392ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4393ed502f03SStefano Zampini   PetscFunctionReturn(0);
4394ed502f03SStefano Zampini }
4395c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the values array of A, as returned by
  MatSeqAIJCUSPARSEGetArrayRead(), into v.

  Input Parameters:
+ A   - the matrix whose values are read
. n   - number of entries to copy
- idx - positions in the values array to gather, or NULL to copy the first n entries;
        it is uploaded to the device through a THRUSTINTARRAY, so it is read as a host array

  Output Parameter:
. v - receives the n values; isCudaMem(v) decides whether it is handled as device or host memory

  Notes:
  When idx is given and v is not device memory, the gather is performed into a temporary device
  buffer which is then copied back to v.

  Only transfers that cross the host/device boundary are logged: the upload of idx with
  PetscLogCpuToGpu() and the download of the values with PetscLogGpuToCpu().
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* gather path: move the requested positions to the device first */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* Staging buffer, sized only when v is host memory. It has automatic storage so that
       an early return from the error checks below cannot leak it. */
    THRUSTARRAY w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[k]] with dv[k]; VecCUDAEquals (defined elsewhere) is expected to assign the first tuple entry to the second */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* no index set (or nothing to gather): contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values travelled from the device to the host, so account for them as GPU-to-CPU traffic */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4435