xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision ae37ee3111c66ae795c44747338c8a36993a93d2)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
16e8d2b73aSMark Adams 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21afb2bd1cSJunchao Zhang 
22afb2bd1cSJunchao Zhang   typedef enum {
23afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
24afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
25afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
27afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
28afb2bd1cSJunchao Zhang 
29afb2bd1cSJunchao Zhang   typedef enum {
30afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
42afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
43afb2bd1cSJunchao Zhang 
44afb2bd1cSJunchao Zhang   typedef enum {
45afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
48afb2bd1cSJunchao Zhang   */
49afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52afb2bd1cSJunchao Zhang #endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
6833c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
696fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
716fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
726fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
75e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
769ae82921SPaul Mullowney 
777f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
827f756511SDominic Meiser 
83042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
85a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
8657181aedSStefano Zampini 
877e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
887e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
897e8381f9SStefano Zampini 
90c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
91c215019aSStefano Zampini 
92b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
93b06137fdSPaul Mullowney {
94b06137fdSPaul Mullowney   cusparseStatus_t   stat;
95b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
96b06137fdSPaul Mullowney 
97b06137fdSPaul Mullowney   PetscFunctionBegin;
98d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
99b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
10057d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
101b06137fdSPaul Mullowney   PetscFunctionReturn(0);
102b06137fdSPaul Mullowney }
103b06137fdSPaul Mullowney 
104b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
105b06137fdSPaul Mullowney {
106b06137fdSPaul Mullowney   cusparseStatus_t   stat;
107b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
108b06137fdSPaul Mullowney 
109b06137fdSPaul Mullowney   PetscFunctionBegin;
110d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1116b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11216a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11357d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
11416a2e217SAlejandro Lamas Daviña     }
115b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1166b1cf21dSAlejandro Lamas Daviña   }
11757d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
118b06137fdSPaul Mullowney   PetscFunctionReturn(0);
119b06137fdSPaul Mullowney }
120b06137fdSPaul Mullowney 
121b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
122b06137fdSPaul Mullowney {
123b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1247e8381f9SStefano Zampini   PetscBool          flg;
1257e8381f9SStefano Zampini   PetscErrorCode     ierr;
126ccdfe979SStefano Zampini 
127b06137fdSPaul Mullowney   PetscFunctionBegin;
1287e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1297e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
130ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
131b06137fdSPaul Mullowney   PetscFunctionReturn(0);
132b06137fdSPaul Mullowney }
133b06137fdSPaul Mullowney 
134ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1359ae82921SPaul Mullowney {
1369ae82921SPaul Mullowney   PetscFunctionBegin;
1379ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1389ae82921SPaul Mullowney   PetscFunctionReturn(0);
1399ae82921SPaul Mullowney }
1409ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1539ae82921SPaul Mullowney 
15442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1559ae82921SPaul Mullowney {
1569ae82921SPaul Mullowney   PetscErrorCode ierr;
157bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1589ae82921SPaul Mullowney 
1599ae82921SPaul Mullowney   PetscFunctionBegin;
160bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
161bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1622c7c0729SBarry Smith   (*B)->factortype = ftype;
1639ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1642205254eSKarl Rupp 
165087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16633d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1679ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1689ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1694ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1704ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
1714ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
172087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
173087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
174087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1754ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
1764ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
1779ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
178bc3f50f2SPaul Mullowney 
179fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1804ac6704cSBarry Smith   (*B)->canuseordering = PETSC_TRUE;
1813ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1829ae82921SPaul Mullowney   PetscFunctionReturn(0);
1839ae82921SPaul Mullowney }
1849ae82921SPaul Mullowney 
185bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
186ca45077fSPaul Mullowney {
187aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1886e111a19SKarl Rupp 
189ca45077fSPaul Mullowney   PetscFunctionBegin;
190ca45077fSPaul Mullowney   switch (op) {
191e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
192aa372e3fSPaul Mullowney     cusparsestruct->format = format;
193ca45077fSPaul Mullowney     break;
194e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
195aa372e3fSPaul Mullowney     cusparsestruct->format = format;
196ca45077fSPaul Mullowney     break;
197ca45077fSPaul Mullowney   default:
19836d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
199ca45077fSPaul Mullowney   }
200ca45077fSPaul Mullowney   PetscFunctionReturn(0);
201ca45077fSPaul Mullowney }
2029ae82921SPaul Mullowney 
203e057df02SPaul Mullowney /*@
204e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
205e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
206aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
207e057df02SPaul Mullowney    Not Collective
208e057df02SPaul Mullowney 
209e057df02SPaul Mullowney    Input Parameters:
2108468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
21136d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2122692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
213e057df02SPaul Mullowney 
214e057df02SPaul Mullowney    Output Parameter:
215e057df02SPaul Mullowney 
216e057df02SPaul Mullowney    Level: intermediate
217e057df02SPaul Mullowney 
2188468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
219e057df02SPaul Mullowney @*/
220e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
221e057df02SPaul Mullowney {
222e057df02SPaul Mullowney   PetscErrorCode ierr;
2236e111a19SKarl Rupp 
224e057df02SPaul Mullowney   PetscFunctionBegin;
225e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
226e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
227e057df02SPaul Mullowney   PetscFunctionReturn(0);
228e057df02SPaul Mullowney }
229e057df02SPaul Mullowney 
2301a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
231e6e9a74fSStefano Zampini {
232e6e9a74fSStefano Zampini   PetscErrorCode ierr;
233e6e9a74fSStefano Zampini 
234e6e9a74fSStefano Zampini   PetscFunctionBegin;
2351a2c6b5cSJunchao Zhang   switch (op) {
2361a2c6b5cSJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
2371a2c6b5cSJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
2381a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
2391a2c6b5cSJunchao Zhang       A->form_explicit_transpose = flg;
2401a2c6b5cSJunchao Zhang       break;
2411a2c6b5cSJunchao Zhang     default:
2421a2c6b5cSJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
2431a2c6b5cSJunchao Zhang       break;
244e6e9a74fSStefano Zampini   }
245e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
246e6e9a74fSStefano Zampini }
247e6e9a74fSStefano Zampini 
248bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
249bddcd29dSMark Adams 
250bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
251bddcd29dSMark Adams {
252bddcd29dSMark Adams   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
253bddcd29dSMark Adams   IS             isrow = b->row,iscol = b->col;
254bddcd29dSMark Adams   PetscBool      row_identity,col_identity;
255bddcd29dSMark Adams   PetscErrorCode ierr;
256bddcd29dSMark Adams 
257bddcd29dSMark Adams   PetscFunctionBegin;
258bddcd29dSMark Adams   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
259bddcd29dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
260bddcd29dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
261bddcd29dSMark Adams   /* determine which version of MatSolve needs to be used. */
262bddcd29dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
263bddcd29dSMark Adams   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
264bddcd29dSMark Adams   if (row_identity && col_identity) {
265bddcd29dSMark Adams     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
266bddcd29dSMark Adams     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
267bddcd29dSMark Adams     B->ops->matsolve = NULL;
268bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
269bddcd29dSMark Adams   } else {
270bddcd29dSMark Adams     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
271bddcd29dSMark Adams     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
272bddcd29dSMark Adams     B->ops->matsolve = NULL;
273bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
274bddcd29dSMark Adams   }
275bddcd29dSMark Adams 
276bddcd29dSMark Adams   /* get the triangular factors */
277bddcd29dSMark Adams   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
278bddcd29dSMark Adams   PetscFunctionReturn(0);
279bddcd29dSMark Adams }
280bddcd29dSMark Adams 
2814416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2829ae82921SPaul Mullowney {
2839ae82921SPaul Mullowney   PetscErrorCode           ierr;
284e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2859ae82921SPaul Mullowney   PetscBool                flg;
286a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2876e111a19SKarl Rupp 
2889ae82921SPaul Mullowney   PetscFunctionBegin;
289e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2909ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
291e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
292a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
293afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
294afb2bd1cSJunchao Zhang 
2954c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
296a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
297afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
298afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
299afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
300afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
301afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
302a435da06SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
303a435da06SStefano Zampini     if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
304a435da06SStefano Zampini #else
305afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
306a435da06SStefano Zampini #endif
307afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
308afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
309afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
310afb2bd1cSJunchao Zhang 
311afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
312afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
313afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
314afb2bd1cSJunchao Zhang    #endif
3154c87dfd4SPaul Mullowney   }
3160af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
3179ae82921SPaul Mullowney   PetscFunctionReturn(0);
3189ae82921SPaul Mullowney }
3199ae82921SPaul Mullowney 
3206fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3219ae82921SPaul Mullowney {
322da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3239ae82921SPaul Mullowney   PetscErrorCode               ierr;
3249ae82921SPaul Mullowney 
3259ae82921SPaul Mullowney   PetscFunctionBegin;
326da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3279ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3289ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3299ae82921SPaul Mullowney   PetscFunctionReturn(0);
3309ae82921SPaul Mullowney }
3319ae82921SPaul Mullowney 
3326fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3339ae82921SPaul Mullowney {
334da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3359ae82921SPaul Mullowney   PetscErrorCode               ierr;
3369ae82921SPaul Mullowney 
3379ae82921SPaul Mullowney   PetscFunctionBegin;
338da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3399ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3409ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3419ae82921SPaul Mullowney   PetscFunctionReturn(0);
3429ae82921SPaul Mullowney }
3439ae82921SPaul Mullowney 
344087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
345087f3262SPaul Mullowney {
346da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
347087f3262SPaul Mullowney   PetscErrorCode               ierr;
348087f3262SPaul Mullowney 
349087f3262SPaul Mullowney   PetscFunctionBegin;
350da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
351087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
352087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
353087f3262SPaul Mullowney   PetscFunctionReturn(0);
354087f3262SPaul Mullowney }
355087f3262SPaul Mullowney 
356087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
357087f3262SPaul Mullowney {
358da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
359087f3262SPaul Mullowney   PetscErrorCode               ierr;
360087f3262SPaul Mullowney 
361087f3262SPaul Mullowney   PetscFunctionBegin;
362da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
363087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
364087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
365087f3262SPaul Mullowney   PetscFunctionReturn(0);
366087f3262SPaul Mullowney }
367087f3262SPaul Mullowney 
368087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3699ae82921SPaul Mullowney {
3709ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3719ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3729ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
373aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3749ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3759ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3769ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3779ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3789ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
379b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
38057d48284SJunchao Zhang   cudaError_t                       cerr;
3819ae82921SPaul Mullowney 
3829ae82921SPaul Mullowney   PetscFunctionBegin;
383cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
384c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3859ae82921SPaul Mullowney     try {
3869ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3879ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
388da79fbbcSStefano Zampini       if (!loTriFactor) {
3892cbc15d9SMark         PetscScalar                       *AALo;
3902cbc15d9SMark 
3912cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3929ae82921SPaul Mullowney 
3939ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
39457d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
39557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3969ae82921SPaul Mullowney 
3979ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3989ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3999ae82921SPaul Mullowney         AiLo[n]  = nzLower;
4009ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
4019ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
4029ae82921SPaul Mullowney         v        = aa;
4039ae82921SPaul Mullowney         vi       = aj;
4049ae82921SPaul Mullowney         offset   = 1;
4059ae82921SPaul Mullowney         rowOffset= 1;
4069ae82921SPaul Mullowney         for (i=1; i<n; i++) {
4079ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
408e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
4099ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
4109ae82921SPaul Mullowney           rowOffset += nz+1;
4119ae82921SPaul Mullowney 
412580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
413580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
4149ae82921SPaul Mullowney 
4159ae82921SPaul Mullowney           offset      += nz;
4169ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4179ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4189ae82921SPaul Mullowney           offset      += 1;
4199ae82921SPaul Mullowney 
4209ae82921SPaul Mullowney           v  += nz;
4219ae82921SPaul Mullowney           vi += nz;
4229ae82921SPaul Mullowney         }
4232205254eSKarl Rupp 
424aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
425da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
426da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
427aa372e3fSPaul Mullowney         /* Create the matrix description */
42857d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
42957d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4301b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
431afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
432afb2bd1cSJunchao Zhang        #else
43357d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
434afb2bd1cSJunchao Zhang        #endif
43557d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
43657d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
437aa372e3fSPaul Mullowney 
438aa372e3fSPaul Mullowney         /* set the operation */
439aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
440aa372e3fSPaul Mullowney 
441aa372e3fSPaul Mullowney         /* set the matrix */
442aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
443aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
444aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
445aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
446aa372e3fSPaul Mullowney 
447aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
448aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
449aa372e3fSPaul Mullowney 
450aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
451aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
452aa372e3fSPaul Mullowney 
453aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
454aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
455aa372e3fSPaul Mullowney 
456afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
457da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
458afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4591b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
460afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
461afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
462afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
463afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
464afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
465afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
466afb2bd1cSJunchao Zhang       #endif
467afb2bd1cSJunchao Zhang 
468aa372e3fSPaul Mullowney         /* perform the solve analysis */
469aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
470aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
471aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
472d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
4731b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
474d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
475d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
476d49cd2b7SBarry Smith                                #else
477d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
478afb2bd1cSJunchao Zhang                                #endif
479da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
480da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
481aa372e3fSPaul Mullowney 
482da79fbbcSStefano Zampini         /* assign the pointer */
483aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4842cbc15d9SMark         loTriFactor->AA_h = AALo;
48557d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
48657d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4874863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
488da79fbbcSStefano Zampini       } else { /* update values only */
4892cbc15d9SMark         if (!loTriFactor->AA_h) {
4902cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4912cbc15d9SMark         }
492da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4932cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
494da79fbbcSStefano Zampini         v        = aa;
495da79fbbcSStefano Zampini         vi       = aj;
496da79fbbcSStefano Zampini         offset   = 1;
497da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
498da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4992cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
500da79fbbcSStefano Zampini           offset      += nz;
5012cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
502da79fbbcSStefano Zampini           offset      += 1;
503da79fbbcSStefano Zampini           v  += nz;
504da79fbbcSStefano Zampini         }
5052cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
506da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
507da79fbbcSStefano Zampini       }
5089ae82921SPaul Mullowney     } catch(char *ex) {
5099ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5109ae82921SPaul Mullowney     }
5119ae82921SPaul Mullowney   }
5129ae82921SPaul Mullowney   PetscFunctionReturn(0);
5139ae82921SPaul Mullowney }
5149ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Assembles the upper triangular ILU factor held in A as a
  CSR matrix and prepares it for the cuSPARSE triangular solve.

  Input Parameter:
. A - the factored matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Nothing is done when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  If an upper factor struct already exists only its numerical values are rebuilt and re-assigned;
  the index arrays and the solve analysis are left untouched. Otherwise the row offsets, column
  indices and values are built in pinned host buffers, copied into the factor's CSR arrays, the
  cuSPARSE solve analysis is run, and the new struct is stored in upTriFactorPtr.

  In every row the diagonal entry comes first and holds the reciprocal of the value the host
  factor keeps at the diagonal position.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij = (Mat_SeqAIJ*)A->data;
  const PetscInt                    nrows = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *hostCols = aij->j,*diagPos = aij->diag;
  const MatScalar                   *hostVals = aij->a;
  PetscInt                          row,cursor,nzUpper;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* only rebuild when the host copy is the up-to-date one */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* total number of entries of the upper triangular factor */
    nzUpper = diagPos[0]-diagPos[nrows];

    if (upTriFactor) {
      /* the factor already exists: refresh the numerical values only */
      if (!upTriFactor->AA_h) {
        cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }

      /* rows are filled from the last one to the first, walking the value buffer backwards */
      cursor = nzUpper;
      for (row=nrows-1; row>=0; row--) {
        const MatScalar *rowVals = hostVals + diagPos[row+1] + 1;
        const PetscInt  nOffDiag = diagPos[row] - diagPos[row+1]-1; /* entries NOT on the diagonal */

        cursor -= (nOffDiag+1);

        /* diagonal first, then the off-diagonal entries of the row */
        upTriFactor->AA_h[cursor] = 1./rowVals[nOffDiag];
        ierr = PetscArraycpy(&(upTriFactor->AA_h[cursor+1]), rowVals, nOffDiag);CHKERRQ(ierr);
      }
      upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
      ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      PetscScalar *vals_h;
      PetscInt    *rowPtr_h,*colIdx_h;
      CsrMatrix   *csr;

      /* pinned host staging buffers: values, row offsets, column indices */
      cerr = cudaMallocHost((void**) &vals_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &rowPtr_h, (nrows+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &colIdx_h, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* build the CSR arrays from the last row to the first */
      rowPtr_h[0]     = (PetscInt) 0;
      rowPtr_h[nrows] = nzUpper;
      cursor          = nzUpper;
      for (row=nrows-1; row>=0; row--) {
        const PetscInt  first    = diagPos[row+1] + 1;
        const PetscInt  nOffDiag = diagPos[row] - diagPos[row+1]-1; /* entries NOT on the diagonal */
        const MatScalar *rowVals = hostVals + first;
        const PetscInt  *rowCols = hostCols + first;

        /* step back to the start of this row; cursor always equals rowPtr_h[row+1] - (nOffDiag+1) */
        cursor -= (nOffDiag+1);

        /* diagonal entry leads the row */
        colIdx_h[cursor] = (PetscInt) row;
        vals_h[cursor]   = (MatScalar)1./rowVals[nOffDiag];
        rowPtr_h[row]    = cursor;

        /* off-diagonal entries follow */
        ierr = PetscArraycpy(&(colIdx_h[cursor+1]), rowCols, nOffDiag);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(vals_h[cursor+1]), rowVals, nOffDiag);CHKERRQ(ierr);
      }

      /* new triangular factor struct */
      ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
      upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* the solve uses the matrix as stored */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR container and its three arrays */
      upTriFactor->csrMat = new CsrMatrix;
      csr                 = upTriFactor->csrMat;
      csr->num_rows       = nrows;
      csr->num_cols       = nrows;
      csr->num_entries    = nzUpper;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(rowPtr_h, rowPtr_h+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nzUpper);
      csr->column_indices->assign(colIdx_h, colIdx_h+nzUpper);

      csr->values = new THRUSTARRAY(nzUpper);
      csr->values->assign(vals_h, vals_h+nzUpper);

      /* solve analysis: info object, work buffer (CUDA >= 9), then the analysis itself */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     csr->num_rows, csr->num_entries, upTriFactor->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
     #endif

      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               csr->num_rows, csr->num_entries, upTriFactor->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                               upTriFactor->solveInfo,
                               upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
                             #else
                               upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
                             #endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the value buffer is kept for later value-only updates */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
      upTriFactor->AA_h = vals_h;
      /* the index staging buffers are no longer needed */
      cerr = cudaFreeHost(rowPtr_h);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(colIdx_h);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((nrows+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
6609ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds both ILU triangular factors for A and sets up
  the auxiliary data kept in its Mat_SeqAIJCUSPARSETriFactors struct.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors, otherwise an
      error is raised

  Notes:
  After the lower and upper factors are built, a work vector of the local row count is created if
  absent, the nonzero count is recorded, and A->offloadmask is set to PETSC_OFFLOAD_BOTH.
  The row (a->row) and column (a->icol) permutations are copied into rpermIndices / cpermIndices
  only when they are not the identity and have not been copied before.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij                = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowPerm = aij->row,colPerm = aij->icol;
  const PetscInt               nrows = A->rmap->n;
  PetscBool                    isIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* lower factor first, then the upper one */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* work vector is created once and reused afterwards */
  if (!cusparseTriFactors->workVector) {
    cusparseTriFactors->workVector = new THRUSTARRAY(nrows);
  }
  cusparseTriFactors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: copy only if non-trivial and not already present */
  ierr = ISIdentity(rowPerm,&isIdentity);CHKERRQ(ierr);
  if (!(isIdentity || cusparseTriFactors->rpermIndices)) {
    const PetscInt *idx;

    ierr = ISGetIndices(rowPerm,&idx);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(nrows);
    cusparseTriFactors->rpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(rowPerm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same treatment */
  ierr = ISIdentity(colPerm,&isIdentity);CHKERRQ(ierr);
  if (!(isIdentity || cusparseTriFactors->cpermIndices)) {
    const PetscInt *idx;

    ierr = ISGetIndices(colPerm,&idx);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(nrows);
    cusparseTriFactors->cpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(colPerm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
7049ae82921SPaul Mullowney 
705087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
706087f3262SPaul Mullowney {
707087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
708087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
709aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
710aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
711087f3262SPaul Mullowney   cusparseStatus_t                  stat;
712087f3262SPaul Mullowney   PetscErrorCode                    ierr;
71357d48284SJunchao Zhang   cudaError_t                       cerr;
714087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
715087f3262SPaul Mullowney   PetscScalar                       *AAUp;
716087f3262SPaul Mullowney   PetscScalar                       *AALo;
717087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
718087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
719087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
720087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
721087f3262SPaul Mullowney 
722087f3262SPaul Mullowney   PetscFunctionBegin;
723cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
724c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
725087f3262SPaul Mullowney     try {
726da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
727da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
728da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
729087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
73057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
73157d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
732087f3262SPaul Mullowney 
733087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
734087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
735087f3262SPaul Mullowney         AiUp[n]=nzUpper;
736087f3262SPaul Mullowney         offset = 0;
737087f3262SPaul Mullowney         for (i=0; i<n; i++) {
738087f3262SPaul Mullowney           /* set the pointers */
739087f3262SPaul Mullowney           v  = aa + ai[i];
740087f3262SPaul Mullowney           vj = aj + ai[i];
741087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
742087f3262SPaul Mullowney 
743087f3262SPaul Mullowney           /* first, set the diagonal elements */
744087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
74509f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
746087f3262SPaul Mullowney           AiUp[i]      = offset;
74709f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
748087f3262SPaul Mullowney 
749087f3262SPaul Mullowney           offset+=1;
750087f3262SPaul Mullowney           if (nz>0) {
751f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
752580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
753087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
754087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
755087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
756087f3262SPaul Mullowney             }
757087f3262SPaul Mullowney             offset+=nz;
758087f3262SPaul Mullowney           }
759087f3262SPaul Mullowney         }
760087f3262SPaul Mullowney 
761aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
762da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
763da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
764087f3262SPaul Mullowney 
765aa372e3fSPaul Mullowney         /* Create the matrix description */
76657d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
76757d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7681b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
769afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
770afb2bd1cSJunchao Zhang        #else
77157d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
772afb2bd1cSJunchao Zhang        #endif
77357d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
77457d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
775087f3262SPaul Mullowney 
776aa372e3fSPaul Mullowney         /* set the matrix */
777aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
778aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
779aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
780aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
781aa372e3fSPaul Mullowney 
782aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
783aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
784aa372e3fSPaul Mullowney 
785aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
786aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
787aa372e3fSPaul Mullowney 
788aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
789aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
790aa372e3fSPaul Mullowney 
791afb2bd1cSJunchao Zhang         /* set the operation */
792afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
793afb2bd1cSJunchao Zhang 
794afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
795da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
796afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7971b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
798afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
799afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
800afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
801afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
802afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
803afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
804afb2bd1cSJunchao Zhang       #endif
805afb2bd1cSJunchao Zhang 
806aa372e3fSPaul Mullowney         /* perform the solve analysis */
807aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
808aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
809aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
810d49cd2b7SBarry Smith                                  upTriFactor->csrMat->column_indices->data().get(),
8111b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
812d49cd2b7SBarry Smith                                  upTriFactor->solveInfo,
813d49cd2b7SBarry Smith                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
814d49cd2b7SBarry Smith                                 #else
815d49cd2b7SBarry Smith                                   upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
816afb2bd1cSJunchao Zhang                                 #endif
817da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
818da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
819aa372e3fSPaul Mullowney 
820da79fbbcSStefano Zampini         /* assign the pointer */
821aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
822aa372e3fSPaul Mullowney 
823aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
824da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
825da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
826aa372e3fSPaul Mullowney 
827aa372e3fSPaul Mullowney         /* Create the matrix description */
82857d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
82957d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8301b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
831afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
832afb2bd1cSJunchao Zhang        #else
83357d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
834afb2bd1cSJunchao Zhang        #endif
83557d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
83657d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
837aa372e3fSPaul Mullowney 
838aa372e3fSPaul Mullowney         /* set the operation */
839aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
840aa372e3fSPaul Mullowney 
841aa372e3fSPaul Mullowney         /* set the matrix */
842aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
843aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
844aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
845aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
846aa372e3fSPaul Mullowney 
847aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
848aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
849aa372e3fSPaul Mullowney 
850aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
851aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
852aa372e3fSPaul Mullowney 
853aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
854aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
855aa372e3fSPaul Mullowney 
856afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
857da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
858afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8591b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
860afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
861afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
862afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
863afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
864afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
865afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
866afb2bd1cSJunchao Zhang       #endif
867afb2bd1cSJunchao Zhang 
868aa372e3fSPaul Mullowney         /* perform the solve analysis */
869aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
870aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
871aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
872d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
8731b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
874d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
875d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
876d49cd2b7SBarry Smith                                 #else
877d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
878afb2bd1cSJunchao Zhang                                 #endif
879da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
880da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
881aa372e3fSPaul Mullowney 
882da79fbbcSStefano Zampini         /* assign the pointer */
883aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
884087f3262SPaul Mullowney 
885da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
88657d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
88757d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
888da79fbbcSStefano Zampini       } else {
889da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
890da79fbbcSStefano Zampini         offset = 0;
891da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
892da79fbbcSStefano Zampini           /* set the pointers */
893da79fbbcSStefano Zampini           v  = aa + ai[i];
894da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
895da79fbbcSStefano Zampini 
896da79fbbcSStefano Zampini           /* first, set the diagonal elements */
897da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
898da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
899da79fbbcSStefano Zampini 
900da79fbbcSStefano Zampini           offset+=1;
901da79fbbcSStefano Zampini           if (nz>0) {
902da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
903da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
904da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
905da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
906da79fbbcSStefano Zampini             }
907da79fbbcSStefano Zampini             offset+=nz;
908da79fbbcSStefano Zampini           }
909da79fbbcSStefano Zampini         }
910da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
911da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
912da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
913da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
914da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
915da79fbbcSStefano Zampini       }
91657d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
91757d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
918087f3262SPaul Mullowney     } catch(char *ex) {
919087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
920087f3262SPaul Mullowney     }
921087f3262SPaul Mullowney   }
922087f3262SPaul Mullowney   PetscFunctionReturn(0);
923087f3262SPaul Mullowney }
924087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the GPU triangular factors of a
  Cholesky/ICC factored matrix and uploads the row permutation and its inverse.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  The triangular factors themselves are produced by MatSeqAIJCUSPARSEBuildICCTriMatrices().
  The work vector is created only once and kept across calls.
  When the ordering a->row is not the identity, rpermIndices receives the ordering and
  cpermIndices receives its inverse. Arrays left over from a previous call are released
  first, so that repeated numeric factorizations do not leak device memory.

  Returns 0 on success, PETSC_ERR_COR if A->spptr is NULL, or the error code of a failing callee.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* total entry count of the two factors: off-diagonal entries counted twice, diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices (only needed for a non-natural ordering) */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* Release arrays from a previous call before replacing them; delete on a NULL pointer is a no-op.
       NOTE(review): assumes unset rpermIndices/cpermIndices are NULL, as the workVector check above does - confirm */
    delete cusparseTriFactors->rpermIndices;
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    delete cusparseTriFactors->cpermIndices;
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
962087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SEQAIJCUSPARSE.

  Input Parameters:
+ B    - the factor matrix to fill
. A    - the matrix to factor
- info - factorization options, forwarded to MatCholeskyFactorNumeric_SeqAIJ()

  Notes:
  MatSeqAIJCUSPARSECopyFromGPU() is called on A, then the factorization is delegated to
  MatCholeskyFactorNumeric_SeqAIJ() and B is flagged as PETSC_OFFLOAD_CPU. The solve
  callbacks are chosen according to whether the ordering stored in B is the identity, and
  finally the triangular factors are built by MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU().
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *factor  = (Mat_SeqAIJ*)B->data;
  IS             ordering = factor->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the multiple right-hand-side solves are left unset for either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* pick the MatSolve variant that matches the ordering */
  ierr = ISIdentity(ordering,&natural);CHKERRQ(ierr);
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* build the triangular factors from the freshly computed factorization */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9929ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private - builds the explicit transpose (CSC form)
  of one triangular factor and runs the triangular-solve analysis on it.

  Input Parameters:
+ A                  - matrix the log events are attributed to
. cusparseTriFactors - container supplying the cuSPARSE handle
- factor             - the triangular factor to transpose

  Output Parameter:
. factorTOut - the newly allocated transposed factor

  Notes:
  The descriptor of the result copies the matrix type, index base and diagonal type of the
  source factor and flips its fill mode. With CUDA >= 11 the csr2csc work buffer is allocated
  here and stored in the SOURCE factor (csr2cscBuffer/csr2cscBufferSize).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *factor,Mat_SeqAIJCUSPARSETriFactorStruct **factorTOut)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *factorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transposed factor */
  ierr = PetscNew(&factorT);CHKERRQ(ierr);
  factorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the source factor; the fill mode is flipped by the transposition */
  matrixType = cusparseGetMatType(factor->descr);
  indexBase = cusparseGetMatIndexBase(factor->descr);
  fillMode = cusparseGetMatFillMode(factor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(factor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&factorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(factorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(factorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(factorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(factorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so it is solved untransposed */
  factorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the source factor */
  factorT->csrMat = new CsrMatrix;
  factorT->csrMat->num_rows       = factor->csrMat->num_cols;
  factorT->csrMat->num_cols       = factor->csrMat->num_rows;
  factorT->csrMat->num_entries    = factor->csrMat->num_entries;
  factorT->csrMat->row_offsets    = new THRUSTINTARRAY32(factorT->csrMat->num_rows+1);
  factorT->csrMat->column_indices = new THRUSTINTARRAY32(factorT->csrMat->num_entries);
  factorT->csrMat->values         = new THRUSTARRAY(factorT->csrMat->num_entries);

  /* compute the transpose of the source factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, factor->csrMat->num_rows,
                                       factor->csrMat->num_cols, factor->csrMat->num_entries,
                                       factor->csrMat->values->data().get(),
                                       factor->csrMat->row_offsets->data().get(),
                                       factor->csrMat->column_indices->data().get(),
                                       factorT->csrMat->values->data().get(),
                                       factorT->csrMat->row_offsets->data().get(), factorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &factor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&factor->csr2cscBuffer,factor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, factor->csrMat->num_rows,
                          factor->csrMat->num_cols, factor->csrMat->num_entries,
                          factor->csrMat->values->data().get(),
                          factor->csrMat->row_offsets->data().get(),
                          factor->csrMat->column_indices->data().get(),
                          factorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          factorT->csrMat->row_offsets->data().get(), factorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, factor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          factorT->csrMat->column_indices->data().get(), factorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above; it used to be opened a second time and never closed */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&factorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, factorT->solveOp,
                                 factorT->csrMat->num_rows, factorT->csrMat->num_entries, factorT->descr,
                                 factorT->csrMat->values->data().get(), factorT->csrMat->row_offsets->data().get(),
                                 factorT->csrMat->column_indices->data().get(), factorT->solveInfo,
                                 &factorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&factorT->solveBuffer,factorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, factorT->solveOp,
                           factorT->csrMat->num_rows, factorT->csrMat->num_entries, factorT->descr,
                           factorT->csrMat->values->data().get(), factorT->csrMat->row_offsets->data().get(),
                           factorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           factorT->solveInfo,
                           factorT->solvePolicy, factorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           factorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *factorTOut = factorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - creates the explicit transposes of the lower and
  upper triangular factors held in A->spptr and analyzes them for triangular solves.

  Input Parameter:
. A - the factored matrix; A->spptr must be a Mat_SeqAIJCUSPARSETriFactors whose
      loTriFactorPtr and upTriFactorPtr are set

  Notes:
  On success the results are stored in loTriFactorPtrTranspose and upTriFactorPtrTranspose.
  The lower factor is processed first and stored before the upper factor is started.

  Returns 0 on success, PETSC_ERR_COR if the factor container or one of the factors is
  missing, or the error code of a failing CUDA/cuSPARSE/PETSc call.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor,*upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = NULL;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = NULL;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1198bda325fcSPaul Mullowney 
/* Unary functor, callable from host and device code, that converts a PetscScalar
   to a PetscInt by casting its real part. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar scalar)
  {
    /* PetscRealPart() supplies the value to convert; the cast to PetscInt does the rest */
    return static_cast<PetscInt>(PetscRealPart(scalar));
  }
};
1207a49f1ed0SStefano Zampini 
12083606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1209bda325fcSPaul Mullowney {
1210aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1211a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1212bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1213bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1214aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1215b06137fdSPaul Mullowney   cudaError_t                  err;
121685ba7357SStefano Zampini   PetscErrorCode               ierr;
1217b175d8bbSPaul Mullowney 
1218bda325fcSPaul Mullowney   PetscFunctionBegin;
1219a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1220a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1221e8d2b73aSMark Adams   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1222a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1223e8d2b73aSMark Adams   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
12241a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
122585ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1226ee7b52eaSHong Zhang   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1227a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1228a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1229a49f1ed0SStefano Zampini   }
1230a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1231aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
123257d48284SJunchao Zhang     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1233aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
123457d48284SJunchao Zhang     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
123557d48284SJunchao Zhang     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1236aa372e3fSPaul Mullowney 
1237b06137fdSPaul Mullowney     /* set alpha and beta */
1238afb2bd1cSJunchao Zhang     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12397656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12407656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1241afb2bd1cSJunchao Zhang     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12427656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12437656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1244b06137fdSPaul Mullowney 
1245aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1246aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1247a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1248554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1249554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1250aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1251a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1252aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1253aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1254a3fdcf43SKarl Rupp 
1255039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
125681902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1257afb2bd1cSJunchao Zhang 
1258afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
12593606e59fSJunchao Zhang       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1260afb2bd1cSJunchao Zhang         stat = cusparseCreateCsr(&matstructT->matDescr,
1261afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1262afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1263afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1264afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1265afb2bd1cSJunchao Zhang                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
12663606e59fSJunchao Zhang       #else
12673606e59fSJunchao Zhang         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
12683606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
12693606e59fSJunchao Zhang 
12703606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
12713606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
12723606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
12733606e59fSJunchao Zhang         */
12743606e59fSJunchao Zhang         if (matrixT->num_entries) {
12753606e59fSJunchao Zhang           stat = cusparseCreateCsr(&matstructT->matDescr,
12763606e59fSJunchao Zhang                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
12773606e59fSJunchao Zhang                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
12783606e59fSJunchao Zhang                                  matrixT->values->data().get(),
12793606e59fSJunchao Zhang                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
12803606e59fSJunchao Zhang                                  indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
12813606e59fSJunchao Zhang 
12823606e59fSJunchao Zhang         } else {
12833606e59fSJunchao Zhang           matstructT->matDescr = NULL;
12843606e59fSJunchao Zhang           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
12853606e59fSJunchao Zhang         }
12863606e59fSJunchao Zhang       #endif
1287afb2bd1cSJunchao Zhang      #endif
1288aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1289afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1290afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1291afb2bd1cSJunchao Zhang    #else
1292aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
129351c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
129451c6d536SStefano Zampini       /* First convert HYB to CSR */
1295aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1296aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1297aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1298aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1299aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1300aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1301aa372e3fSPaul Mullowney 
1302aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1303aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1304aa372e3fSPaul Mullowney                               temp->values->data().get(),
1305aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
130657d48284SJunchao Zhang                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1307aa372e3fSPaul Mullowney 
1308aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1309aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1310aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1311aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1312aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1313aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1314aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1315aa372e3fSPaul Mullowney 
1316aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1317aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1318aa372e3fSPaul Mullowney                               temp->values->data().get(),
1319aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1320aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1321aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1322aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1323aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
132457d48284SJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1325aa372e3fSPaul Mullowney 
1326aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1327aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
132857d48284SJunchao Zhang       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1329aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1330aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1331aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1332aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1333aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1334aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
133557d48284SJunchao Zhang                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1336aa372e3fSPaul Mullowney 
1337aa372e3fSPaul Mullowney       /* assign the pointer */
1338aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13391a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1340aa372e3fSPaul Mullowney       /* delete temporaries */
1341aa372e3fSPaul Mullowney       if (tempT) {
1342aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1343aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1344aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1345aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1346087f3262SPaul Mullowney       }
1347aa372e3fSPaul Mullowney       if (temp) {
1348aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1349aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1350aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1351aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1352aa372e3fSPaul Mullowney       }
1353afb2bd1cSJunchao Zhang      #endif
1354aa372e3fSPaul Mullowney     }
1355a49f1ed0SStefano Zampini   }
1356a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1357a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1358a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1359e8d2b73aSMark Adams     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1360e8d2b73aSMark Adams     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1361e8d2b73aSMark Adams     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1362e8d2b73aSMark Adams     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1363e8d2b73aSMark Adams     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1364e8d2b73aSMark Adams     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1365e8d2b73aSMark Adams     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1366e8d2b73aSMark Adams     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1367a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1368a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1369a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1370a49f1ed0SStefano Zampini       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1371a49f1ed0SStefano Zampini     }
1372a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1373a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1374a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1375a49f1ed0SStefano Zampini 
1376a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1377a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1378a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1379a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1380a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1381a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1382a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1383a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1384a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1385a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1386a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1387a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1388a49f1ed0SStefano Zampini                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1389a49f1ed0SStefano Zampini       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1390a49f1ed0SStefano Zampini      #endif
1391a49f1ed0SStefano Zampini 
13921a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13931a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13941a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13951a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
13961a2c6b5cSJunchao Zhang 
13971a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13981a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13991a2c6b5cSJunchao Zhang         */
14001a2c6b5cSJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
14011a2c6b5cSJunchao Zhang                               A->cmap->n,matrix->num_entries,
14021a2c6b5cSJunchao Zhang                               csr2csc_a.data().get(),
14031a2c6b5cSJunchao Zhang                               cusparsestruct->rowoffsets_gpu->data().get(),
14041a2c6b5cSJunchao Zhang                               matrix->column_indices->data().get(),
1405a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1406a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1407a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1408a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
14091a2c6b5cSJunchao Zhang                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1410a49f1ed0SStefano Zampini                              #else
1411a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
14121a2c6b5cSJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1413a49f1ed0SStefano Zampini                              #endif
14141a2c6b5cSJunchao Zhang       } else {
14151a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
14161a2c6b5cSJunchao Zhang       }
14171a2c6b5cSJunchao Zhang 
1418a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1419a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1420a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1421a49f1ed0SStefano Zampini       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1422a49f1ed0SStefano Zampini      #endif
1423a49f1ed0SStefano Zampini     }
1424a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1425a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1426a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1427a49f1ed0SStefano Zampini   }
1428ee7b52eaSHong Zhang   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
142985ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1430213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1431213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1432aa372e3fSPaul Mullowney   /* assign the pointer */
1433aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
14341a2c6b5cSJunchao Zhang   A->transupdated = PETSC_TRUE;
1435bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1436bda325fcSPaul Mullowney }
1437bda325fcSPaul Mullowney 
1438a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve with the GPU-resident triangular factors of A,
  applying the row and column permutations stored alongside the factors.

  Input Parameters:
    A  - factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
    bb - right-hand side vector (read only)

  Output Parameter:
    xx - solution vector (overwritten)

  Notes:
    The transposed factors are built lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() the first
    time this routine is called. cusparseTriFactors->workVector is used as scratch space between the
    two triangular solves and for the final permutation.

    The end of each permuted range is determined by the index iterator (rpermIndices/cpermIndices),
    so the same element base is passed for both ends of the range.

    Thrust reports failures by throwing; the thrust calls are wrapped in PetscStackCallThrust() so
    that such failures are turned into PETSc errors instead of escaping this function.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }
  /* Both factors are dereferenced below; fail cleanly rather than crash if either one is absent */
  if (!loTriFactorT || !upTriFactorT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing transposed triangular factors");

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, gather the right-hand side through the row permutation: x[i] = b[rperm[i]] */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* Next, solve with the transposed U factor: reads xarray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed L factor: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, gather the solution through the column permutation into the work vector ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the work vector back into the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

1526bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transpose solve with the GPU-resident triangular
  factors of A when no row/column permutation has to be applied.

  Input Parameters:
    A  - factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
    bb - right-hand side vector (read only)

  Output Parameter:
    xx - solution vector (overwritten)

  Notes:
    The transposed factors are built lazily by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() the first
    time this routine is called. The first triangular solve reads bb directly and writes into
    cusparseTriFactors->workVector; the second reads the work vector and writes into xx.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }
  /* Both factors are dereferenced below; fail cleanly rather than crash if either one is absent */
  if (!loTriFactorT || !upTriFactorT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing transposed triangular factors");

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve with the transposed U factor: reads barray, writes the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve with the transposed L factor: reads the work vector, writes xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

1596bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Solves with the triangular factors kept in A->spptr when the
  factorization carries row and column permutations.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The sequence is: gather bb through rpermIndices into the work vector, solve with the lower
  factor (work vector -> xx), solve with the upper factor (xx -> work vector), and finally
  gather the work vector through cpermIndices into xx. xx is therefore also used as the
  intermediate vector between the two triangular solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  cusparseStatus_t                  cs;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Fetch the device arrays and wrap them for use with thrust */
  ierr = VecCUDAGetArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b_d);CHKERRQ(ierr);
  thrust::device_ptr<PetscScalar>       xGPU = thrust::device_pointer_cast(x_d);
  thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(b_d);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: work = bb gathered through the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(bGPU, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: lower triangular solve, work -> xx */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  cs = cusparse_solve(factors->handle, lower->solveOp,
                      lower->csrMat->num_rows,
                      lower->csrMat->num_entries,
                      &PETSC_CUSPARSE_ONE, lower->descr,
                      lower->csrMat->values->data().get(),
                      lower->csrMat->row_offsets->data().get(),
                      lower->csrMat->column_indices->data().get(),
                      lower->solveInfo,
                      work->data().get(),
                      x_d,
                      lower->solvePolicy, lower->solveBuffer);CHKERRCUSPARSE(cs);
#else
  cs = cusparse_solve(factors->handle, lower->solveOp,
                      lower->csrMat->num_rows,
                      &PETSC_CUSPARSE_ONE, lower->descr,
                      lower->csrMat->values->data().get(),
                      lower->csrMat->row_offsets->data().get(),
                      lower->csrMat->column_indices->data().get(),
                      lower->solveInfo,
                      work->data().get(),
                      x_d);CHKERRCUSPARSE(cs);
#endif

  /* Step 3: upper triangular solve, xx -> work */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  cs = cusparse_solve(factors->handle, upper->solveOp,
                      upper->csrMat->num_rows,
                      upper->csrMat->num_entries,
                      &PETSC_CUSPARSE_ONE, upper->descr,
                      upper->csrMat->values->data().get(),
                      upper->csrMat->row_offsets->data().get(),
                      upper->csrMat->column_indices->data().get(),
                      upper->solveInfo,
                      x_d,
                      work->data().get(),
                      upper->solvePolicy, upper->solveBuffer);CHKERRCUSPARSE(cs);
#else
  cs = cusparse_solve(factors->handle, upper->solveOp,
                      upper->csrMat->num_rows,
                      &PETSC_CUSPARSE_ONE, upper->descr,
                      upper->csrMat->values->data().get(),
                      upper->csrMat->row_offsets->data().get(),
                      upper->csrMat->column_indices->data().get(),
                      upper->solveInfo,
                      x_d,
                      work->data().get());CHKERRCUSPARSE(cs);
#endif

  /* Step 4: xx = work gathered through the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&b_d);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16729ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Solves with the triangular factors kept in A->spptr
  without applying any permutation.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The lower factor is applied first (bb -> work vector), then the upper factor
  (work vector -> xx).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  cusparseStatus_t                  cs;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Fetch the device arrays */
  ierr = VecCUDAGetArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b_d);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: lower triangular solve, bb -> work */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  cs = cusparse_solve(factors->handle, lower->solveOp,
                      lower->csrMat->num_rows,
                      lower->csrMat->num_entries,
                      &PETSC_CUSPARSE_ONE, lower->descr,
                      lower->csrMat->values->data().get(),
                      lower->csrMat->row_offsets->data().get(),
                      lower->csrMat->column_indices->data().get(),
                      lower->solveInfo,
                      b_d,
                      work->data().get(),
                      lower->solvePolicy,lower->solveBuffer);CHKERRCUSPARSE(cs);
#else
  cs = cusparse_solve(factors->handle, lower->solveOp,
                      lower->csrMat->num_rows,
                      &PETSC_CUSPARSE_ONE, lower->descr,
                      lower->csrMat->values->data().get(),
                      lower->csrMat->row_offsets->data().get(),
                      lower->csrMat->column_indices->data().get(),
                      lower->solveInfo,
                      b_d,
                      work->data().get());CHKERRCUSPARSE(cs);
#endif

  /* Step 2: upper triangular solve, work -> xx */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  cs = cusparse_solve(factors->handle, upper->solveOp,
                      upper->csrMat->num_rows,
                      upper->csrMat->num_entries,
                      &PETSC_CUSPARSE_ONE, upper->descr,
                      upper->csrMat->values->data().get(),
                      upper->csrMat->row_offsets->data().get(),
                      upper->csrMat->column_indices->data().get(),
                      upper->solveInfo,
                      work->data().get(),
                      x_d,
                      upper->solvePolicy, upper->solveBuffer);CHKERRCUSPARSE(cs);
#else
  cs = cusparse_solve(factors->handle, upper->solveOp,
                      upper->csrMat->num_rows,
                      &PETSC_CUSPARSE_ONE, upper->descr,
                      upper->csrMat->values->data().get(),
                      upper->csrMat->row_offsets->data().get(),
                      upper->csrMat->column_indices->data().get(),
                      upper->solveInfo,
                      work->data().get(),
                      x_d);CHKERRCUSPARSE(cs);
#endif

  ierr = VecCUDARestoreArrayRead(bb,&b_d);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17349ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the matrix values from the device CSR storage back
  into the host array a->a.

  Input Parameter:
. A - the matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSE

  Notes:
  Work is done only when A->offloadmask is PETSC_OFFLOAD_GPU; afterwards the mask is set to
  PETSC_OFFLOAD_BOTH. Only the a->nz values are transferred, not the index arrays.

  An error is raised if the device matrix is missing or is not stored in CSR format: for
  MAT_CUSPARSE_ELL/MAT_CUSPARSE_HYB the device handle is a cusparseHybMat_t and must not be
  reinterpreted as a CsrMatrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix;

    /* validate the device storage before treating it as a CsrMatrix */
    if (!cusp || !cusp->mat || !cusp->mat->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing device matrix");
    if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Copying values from the GPU is only supported for MAT_CUSPARSE_CSR");
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values on the device");

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* host and device values now agree */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
17557e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host array of matrix values.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host values array of the Mat_SeqAIJ stored in A->data

  Notes:
  MatSeqAIJCUSPARSECopyFromGPU() is called first, then A->offloadmask is set to
  PETSC_OFFLOAD_CPU (presumably because the caller may modify the returned array).
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* refresh the host values before handing them out */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
17677e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Mirrors the host CSR data of a SeqAIJCUSPARSE matrix on the GPU.

  Input Parameter:
. A - the matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSE

  Notes:
  An error is raised if A is bound to the CPU. Nothing else is done unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Values-only path: taken when a device structure already exists, the nonzero state recorded
  at the last upload equals A->nonzerostate, and the storage format is MAT_CUSPARSE_CSR. Only
  a->a is re-sent and the transpose is invalidated (not destroyed).

  Rebuild path: the device structure, the transpose, the work vector and rowoffsets_gpu are
  destroyed and a new Mat_SeqAIJCUSPARSEMultStruct is built from the (possibly compressed-row)
  host CSR arrays. If a->a is NULL only the sparsity pattern is uploaded and the offload mask
  is left unchanged; otherwise it becomes PETSC_OFFLOAD_BOTH.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* The values-only shortcut needs an existing device structure: matching nonzero states alone
       do not guarantee one has been built, so fall through to the rebuild path when it is missing */
    if (matstruct && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed: any cached transpose is stale but its structure can be kept */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* throw away everything that depends on the old sparsity pattern */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the row pointer array: compressed rows (with their row index map) or the plain CSR one */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* without host values only the pattern is uploaded, so host and device are not yet in sync */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; the handle is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage the data as a temporary CSR matrix, then convert it to HYB/ELL */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the staging CSR copy is no longer needed (mat was allocated above, so it cannot be NULL) */
          if (mat->values) delete (THRUSTARRAY*)mat->values;
          if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
          if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
          delete (CsrMatrix*)mat;
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* log: row offsets and column indices (int), compressed row indices (PetscInt), three scalar constants plus values */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember which pattern is on the device so later calls can take the values-only path */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
19249ae82921SPaul Mullowney 
/* Functor applied to a tuple: adds tuple element 0 to tuple element 1 and stores the sum in element 1 */
struct VecCUDAPlusEquals
{
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple entry)
  {
    /* element 1 is both an input and the destination */
    thrust::get<1>(entry) = thrust::get<1>(entry) + thrust::get<0>(entry);
  }
};
1934aa372e3fSPaul Mullowney 
/* Functor applied to a tuple: copies tuple element 0 into tuple element 1 */
struct VecCUDAEquals
{
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple entry)
  {
    /* source is element 0, destination is element 1 */
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
19447e8381f9SStefano Zampini 
/* Functor applied to a tuple: copies tuple element 1 into tuple element 0 (the opposite direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple entry)
  {
    /* source is element 1, destination is element 0 */
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
1954e6e9a74fSStefano Zampini 
/*
  MatMatCusparse - Per-product data attached to a matrix product (read from product->data)
  and released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool             cisdense;     /* NOTE(review): presumably records whether C was a host dense matrix on entry - confirm against the symbolic phase */
  PetscScalar           *Bt;          /* device buffer, released with cudaFree() on destroy */
  Mat                   X;            /* auxiliary matrix; it receives the sparse-dense product for the RARt and PtAP product types */
  PetscBool             reusesym;     /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;        /* NOTE(review): presumably the flop count logged for the product - confirm */
  CsrMatrix             *Bcsr;        /* CSR matrix object, deleted on destroy */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;  /* sparse matrix descriptor, destroyed with cusparseDestroySpMat() */
  PetscBool             initialized;  /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense matrix descriptor, destroyed with cusparseDestroyDnMat() */
  cusparseDnMatDescr_t  matCDescr;    /* dense matrix descriptor, destroyed with cusparseDestroyDnMat() */
  PetscInt              Blda;         /* Record leading dimensions of B and C here to detect changes */
  PetscInt              Clda;
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;    /* device buffer, released with cudaFree() on destroy */
  void                  *dBuffer5;    /* device buffer, released with cudaFree() on destroy */
 #endif
  size_t                mmBufferSize; /* NOTE(review): presumably the size in bytes of mmBuffer - confirm */
  void                  *mmBuffer;    /* device buffer, released with cudaFree() on destroy */
  void                  *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* SpGEMM descriptor, destroyed with cusparseSpGEMM_destroyDescr() */
#endif
};
1979ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases everything owned by a MatMatCusparse and the
  structure itself.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy

  Notes:
  cuSPARSE descriptors and the optional device buffers are released only when they are
  non-NULL; Bt is always passed to cudaFree().
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *ctx = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cuerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cs;
#endif

  PetscFunctionBegin;
  cuerr = cudaFree(ctx->Bt);CHKERRCUDA(cuerr);
  delete ctx->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cuSPARSE descriptors */
  if (ctx->matSpBDescr) {
    cs = cusparseDestroySpMat(ctx->matSpBDescr);CHKERRCUSPARSE(cs);
  }
  if (ctx->matBDescr) {
    cs = cusparseDestroyDnMat(ctx->matBDescr);CHKERRCUSPARSE(cs);
  }
  if (ctx->matCDescr) {
    cs = cusparseDestroyDnMat(ctx->matCDescr);CHKERRCUSPARSE(cs);
  }
  if (ctx->spgemmDesc) {
    cs = cusparseSpGEMM_destroyDescr(ctx->spgemmDesc);CHKERRCUSPARSE(cs);
  }
  /* device work buffers */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (ctx->dBuffer4) {
    cuerr = cudaFree(ctx->dBuffer4);CHKERRCUDA(cuerr);
  }
  if (ctx->dBuffer5) {
    cuerr = cudaFree(ctx->dBuffer5);CHKERRCUDA(cuerr);
  }
 #endif
  if (ctx->mmBuffer) {
    cuerr = cudaFree(ctx->mmBuffer);CHKERRCUDA(cuerr);
  }
  if (ctx->mmBuffer2) {
    cuerr = cudaFree(ctx->mmBuffer2);CHKERRCUDA(cuerr);
  }
#endif
  ierr = MatDestroy(&ctx->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2008ccdfe979SStefano Zampini 
2009ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2010ccdfe979SStefano Zampini 
2011ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2012ccdfe979SStefano Zampini {
2013ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2014ccdfe979SStefano Zampini   Mat                          A,B;
2015afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
2016ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
2017ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2018ccdfe979SStefano Zampini   cusparseStatus_t             stat;
2019ccdfe979SStefano Zampini   cusparseOperation_t          opA;
2020ccdfe979SStefano Zampini   const PetscScalar            *barray;
2021ccdfe979SStefano Zampini   PetscScalar                  *carray;
2022ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2023ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2024ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2025ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2026ccdfe979SStefano Zampini 
2027ccdfe979SStefano Zampini   PetscFunctionBegin;
2028ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2029e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2030ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
2031ccdfe979SStefano Zampini   A    = product->A;
2032ccdfe979SStefano Zampini   B    = product->B;
2033ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2034e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2035ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2036ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
2037ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2038ccdfe979SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2039ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2040ccdfe979SStefano Zampini   switch (product->type) {
2041ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2042ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2043ccdfe979SStefano Zampini     mat = cusp->mat;
2044ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2045ccdfe979SStefano Zampini     m   = A->rmap->n;
2046ccdfe979SStefano Zampini     n   = B->cmap->n;
2047ccdfe979SStefano Zampini     break;
2048ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
20491a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2050e6e9a74fSStefano Zampini       mat = cusp->mat;
2051e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2052e6e9a74fSStefano Zampini     } else {
20533606e59fSJunchao Zhang       ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
2054ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2055ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2056e6e9a74fSStefano Zampini     }
2057ccdfe979SStefano Zampini     m = A->cmap->n;
2058ccdfe979SStefano Zampini     n = B->cmap->n;
2059ccdfe979SStefano Zampini     break;
2060ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2061ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2062ccdfe979SStefano Zampini     mat = cusp->mat;
2063ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2064ccdfe979SStefano Zampini     m   = A->rmap->n;
2065ccdfe979SStefano Zampini     n   = B->rmap->n;
2066ccdfe979SStefano Zampini     break;
2067ccdfe979SStefano Zampini   default:
2068e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2069ccdfe979SStefano Zampini   }
2070e8d2b73aSMark Adams   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2071ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2072ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2073ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2074afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2075ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2076afb2bd1cSJunchao Zhang 
2077ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2078c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2079c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2080c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2081c8378d12SStefano Zampini   } else {
2082c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2083c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2084c8378d12SStefano Zampini   }
2085c8378d12SStefano Zampini 
2086c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2087afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2088afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2089a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2090afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2091fcdce8c4SStefano Zampini     size_t mmBufferSize;
2092afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2093afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2094afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2095afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2096afb2bd1cSJunchao Zhang     }
2097c8378d12SStefano Zampini 
2098afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2099afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2100afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2101afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2102afb2bd1cSJunchao Zhang     }
2103afb2bd1cSJunchao Zhang 
2104afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2105afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2106afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2107afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2108afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2109afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2110afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2111afb2bd1cSJunchao Zhang     }
2112afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2113afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2114afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2115fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2116fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2117ee7b52eaSHong Zhang       cudaError_t cerr;
2118fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2119fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2120fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2121fcdce8c4SStefano Zampini     }
2122afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2123afb2bd1cSJunchao Zhang   } else {
2124afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2125afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2126afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2127afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2128afb2bd1cSJunchao Zhang   }
2129afb2bd1cSJunchao Zhang 
2130afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2131afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2132afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2133afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2134fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2135afb2bd1cSJunchao Zhang  #else
2136afb2bd1cSJunchao Zhang   PetscInt k;
2137afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2138ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2139ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2140ccdfe979SStefano Zampini     cublasStatus_t cerr;
2141ccdfe979SStefano Zampini 
2142ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2143ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2144ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2145ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2146ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2147ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2148ccdfe979SStefano Zampini     blda = B->cmap->n;
2149afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2150afb2bd1cSJunchao Zhang   } else {
2151afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2152ccdfe979SStefano Zampini   }
2153ccdfe979SStefano Zampini 
2154afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2155ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2156afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2157ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2158ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2159ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2160ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2161ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2162afb2bd1cSJunchao Zhang  #endif
2163c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2164c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2165ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2166ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2167ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2168ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2169ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2170ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2171ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2172ccdfe979SStefano Zampini   } else {
2173ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2174ccdfe979SStefano Zampini   }
2175ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2176ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2177ccdfe979SStefano Zampini   }
2178ccdfe979SStefano Zampini   if (!biscuda) {
2179ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2180ccdfe979SStefano Zampini   }
2181ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2182ccdfe979SStefano Zampini }
2183ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of a matrix product whose A operand
  is a MATSEQAIJCUSPARSE matrix stored in MAT_CUSPARSE_CSR format and whose result C is dense.

  Input/Output Parameter:
. C - the product matrix; C->product supplies the operands A and B and the product type

  What this routine does:
  - checks that no product data is attached yet, that A is MATSEQAIJCUSPARSE and that A uses the CSR format;
  - sets the sizes of C from the product type (AB, AtB, ABt, PtAP, RARt); any other type is an error;
  - records in mmdata->cisdense whether C was of type MATSEQDENSE on entry, then switches C to MATSEQDENSECUDA;
  - with CUDA older than 11.0, allocates mmdata->Bt (B->rmap->n * B->cmap->n scalars) for ABt and RARt;
  - for RARt and PtAP, creates (without preallocating) the intermediate dense matrix mmdata->X;
  - installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric phase.

  Returns 0 on success, or a PETSc error code raised through SETERRQ/CHKERRQ/CHKERRCUDA.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  /* C is m x n; the dimensions depend only on the product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data: attach it (and its destructor) to C right after allocation, as done in
     MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE, so that a failure in any of the calls
     below does not orphan mmdata, mmdata->Bt or mmdata->X.
     NOTE(review): this relies on MatDestroy_MatMatCusparse (defined elsewhere in this file)
     accepting a partially filled, zero-initialized struct; the sparse-sparse symbolic routine
     already depends on the same property */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2257ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of a sparse-sparse product in which
  A, B and C are all MATSEQAIJCUSPARSE matrices in MAT_CUSPARSE_CSR format.

  Input/Output Parameter:
. C - the product matrix; C->product supplies the operands, the product type and the
      MatMatCusparse data attached by the symbolic phase

  Notes:
  - If mmdata->reusesym is set, the flag is cleared and no computation is performed; only the
    assembly bookkeeping at the end is executed.
  - If C has no nonzeros (c->nz == 0) the computation is skipped as well.
  - AtB with A->symmetric set, and ABt with B->symmetric set, are handled as AB; an error is raised
    if the corresponding product->symbolic_used_the_fact_*_is_symmetric flag is not set.
  - For AtB and ABt the transposed operand is read from matTranspose; this routine does not form it,
    and a missing mult struct is reported as an error.
  - The multiplication is done by cusparseSpGEMMreuse_compute() (CUDA >= 11.4),
    by cusparseSpGEMM_compute() followed by cusparseSpGEMM_copy() (CUDA 11.0 - 11.3),
    or by cusparse_csr_spgemm() (older CUDA).

  Returns 0 on success, or a PETSc error code raised through SETERRQ/CHKERRQ/CHKERRCUSPARSE/CHKERRCUDA.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE; /* one-shot: the next numeric call recomputes the values */
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  /* nothing to compute for a product without nonzeros */
  if (!c->nz) goto finalize;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* a transposed symmetric operand is treated as untransposed, provided the symbolic phase did the same */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }
  /* opA/opB are always NON_TRANSPOSE, so transposed operands come from the explicit matTranspose structs */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* NOTE(review): device pointer mode presumably because alpha_one/beta_zero live in device memory - confirm against their allocation */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                               mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* the freshly computed values of C exist only on the device */
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2387fcdce8c4SStefano Zampini 
2388fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2389fcdce8c4SStefano Zampini {
2390fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2391fcdce8c4SStefano Zampini   Mat                          A,B;
2392fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2393fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2394fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2395fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2396fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2397fcdce8c4SStefano Zampini   PetscBool                    flg;
2398fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2399fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2400fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2401fcdce8c4SStefano Zampini   MatProductType               ptype;
2402fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2403fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2404fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2405fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2406fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2407fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2408fcdce8c4SStefano Zampini #else
2409fcdce8c4SStefano Zampini   int                          cnz;
2410fcdce8c4SStefano Zampini #endif
2411b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2412fcdce8c4SStefano Zampini 
2413fcdce8c4SStefano Zampini   PetscFunctionBegin;
2414fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2415e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2416fcdce8c4SStefano Zampini   A    = product->A;
2417fcdce8c4SStefano Zampini   B    = product->B;
2418fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2419e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2420fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2421e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2422fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2423fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2424fcdce8c4SStefano Zampini   /* product data */
2425fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2426fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2427fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2428fcdce8c4SStefano Zampini 
2429fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2430fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2431d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2432d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2433d60bce21SJunchao Zhang   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2434d60bce21SJunchao Zhang   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2435d60bce21SJunchao Zhang 
2436fcdce8c4SStefano Zampini   ptype = product->type;
2437fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2438fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2439fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2440fa046f9fSJunchao Zhang   }
2441fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2442fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2443fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2444fa046f9fSJunchao Zhang   }
2445fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2446fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2447fcdce8c4SStefano Zampini   switch (ptype) {
2448fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2449fcdce8c4SStefano Zampini     m = A->rmap->n;
2450fcdce8c4SStefano Zampini     n = B->cmap->n;
2451fcdce8c4SStefano Zampini     k = A->cmap->n;
2452fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2453fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2454fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2455fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2456fcdce8c4SStefano Zampini     break;
2457fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2458fcdce8c4SStefano Zampini     m = A->cmap->n;
2459fcdce8c4SStefano Zampini     n = B->cmap->n;
2460fcdce8c4SStefano Zampini     k = A->rmap->n;
24613606e59fSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
2462fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2463fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2464fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2465fcdce8c4SStefano Zampini     break;
2466fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2467fcdce8c4SStefano Zampini     m = A->rmap->n;
2468fcdce8c4SStefano Zampini     n = B->rmap->n;
2469fcdce8c4SStefano Zampini     k = A->cmap->n;
24703606e59fSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
2471fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2472fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2473fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2474fcdce8c4SStefano Zampini     break;
2475fcdce8c4SStefano Zampini   default:
2476e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2477fcdce8c4SStefano Zampini   }
2478fcdce8c4SStefano Zampini 
2479fcdce8c4SStefano Zampini   /* create cusparse matrix */
2480fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2481fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2482fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2483fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2484fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2485fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2486fcdce8c4SStefano Zampini 
2487fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2488fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2489fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2490fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2491fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2492fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2493fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2494fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2495fcdce8c4SStefano Zampini   } else {
2496fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2497fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2498fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2499fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2500fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2501fcdce8c4SStefano Zampini   }
2502fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2503fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2504fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2505fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2506fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2507fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2508fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2509fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2510fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2511fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2512fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2513fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2514fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2515fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2516fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2517fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2518fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2519fcdce8c4SStefano Zampini     c->nz = 0;
2520fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2521fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2522fcdce8c4SStefano Zampini     goto finalizesym;
2523fcdce8c4SStefano Zampini   }
2524fcdce8c4SStefano Zampini 
2525e8d2b73aSMark Adams   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2526e8d2b73aSMark Adams   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2527fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2528fcdce8c4SStefano Zampini   if (!biscompressed) {
2529fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2530fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2531fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2532fcdce8c4SStefano Zampini #endif
2533fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2534fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2535fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2536fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2537fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2538fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2539fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2540fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2541fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2542fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2543fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2544fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2545fcdce8c4SStefano Zampini     }
2546fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2547fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2548fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2549fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2550fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2551fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2552fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2553fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2554fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2555fcdce8c4SStefano Zampini     }
2556fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2557fcdce8c4SStefano Zampini #endif
2558fcdce8c4SStefano Zampini   }
2559e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2560e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2561fcdce8c4SStefano Zampini   /* precompute flops count */
2562fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2563fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2564fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2565fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2566fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2567fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2568fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2569fcdce8c4SStefano Zampini       }
2570fcdce8c4SStefano Zampini     }
2571fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2572fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2573fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2574fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2575fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2576fcdce8c4SStefano Zampini     }
2577fcdce8c4SStefano Zampini   } else { /* TODO */
2578fcdce8c4SStefano Zampini     flops = 0.;
2579fcdce8c4SStefano Zampini   }
2580fcdce8c4SStefano Zampini 
2581fcdce8c4SStefano Zampini   mmdata->flops = flops;
2582fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2583b4285af6SJunchao Zhang 
2584fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2585fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2586fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2587fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
2588fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2589fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2590fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2591b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2592b4285af6SJunchao Zhang  {
2593b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2594b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2595b4285af6SJunchao Zhang   */
2596b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
2597b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
2598b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
2599b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2600b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
2601b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
2602b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
2603b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
2604b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
2605b4285af6SJunchao Zhang 
2606b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2607b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
2608b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2609b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2610b4285af6SJunchao Zhang                                             &bufferSize1, NULL);CHKERRCUSPARSE(stat);
2611b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
2612b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
2613b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2614b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2615b4285af6SJunchao Zhang                                             &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);
2616b4285af6SJunchao Zhang 
2617b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2618b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2619b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2620b4285af6SJunchao Zhang                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
2621b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
2622b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
2623b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
2624b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2625b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2626b4285af6SJunchao Zhang                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
2627b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
2628b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);
2629b4285af6SJunchao Zhang 
2630b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2631b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
2632b4285af6SJunchao Zhang   stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2633b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
2634b4285af6SJunchao Zhang   /* allocate matrix C */
2635b4285af6SJunchao Zhang   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2636b4285af6SJunchao Zhang   Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2637b4285af6SJunchao Zhang   /* update matC with the new pointers */
2638b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2639b4285af6SJunchao Zhang                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2640b4285af6SJunchao Zhang 
2641b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2642b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2643b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2644b4285af6SJunchao Zhang                                   &bufferSize5, NULL);CHKERRCUSPARSE(stat);
2645b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
2646b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2647b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2648b4285af6SJunchao Zhang                                   &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
2649b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
2650b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2651b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2652b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2653b4285af6SJunchao Zhang                                      mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2654b4285af6SJunchao Zhang   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
2655b4285af6SJunchao Zhang  }
2656*ae37ee31SJunchao Zhang  #else
2657b4285af6SJunchao Zhang   size_t bufSize2;
2658fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2659b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2660fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2661fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2662fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2663bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2664fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2665b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2666fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2667fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2668fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2669fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2670b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2671fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2672fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2673fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2674fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2675fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2676fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2677fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2678fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2679bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2680fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2681b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2682fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2683fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2684fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2685fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2686fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2687fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
268800702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2689fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2690fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2691fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2692fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2693fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2694fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2695b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2696fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2697fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2698*ae37ee31SJunchao Zhang  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2699fcdce8c4SStefano Zampini #else
2700fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2701b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2702fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2703fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2704fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2705fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2706fcdce8c4SStefano Zampini   c->nz = cnz;
2707fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2708fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2709fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2710fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2711fcdce8c4SStefano Zampini 
2712fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2713fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2714fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2715fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2716b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2717fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2718fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2719fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2720fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2721fcdce8c4SStefano Zampini #endif
2722fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2723fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2724fcdce8c4SStefano Zampini finalizesym:
2725fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2726fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2727fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2728fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2729fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2730fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2731fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2732fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2733fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2734fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2735fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2736fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2737fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2738fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2739fcdce8c4SStefano Zampini   } else {
2740fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2741fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2742fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2743fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2744fcdce8c4SStefano Zampini   }
2745fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2746fcdce8c4SStefano Zampini     PetscInt r = 0;
2747fcdce8c4SStefano Zampini     c->i[0] = 0;
2748fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2749fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2750fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2751fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2752fcdce8c4SStefano Zampini     }
2753fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2754fcdce8c4SStefano Zampini   }
2755fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2756fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2757fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2758fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2759fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2760fcdce8c4SStefano Zampini   c->rmax = 0;
2761fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2762fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2763fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2764fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2765fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2766fcdce8c4SStefano Zampini   }
2767fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2768fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2769fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2770fcdce8c4SStefano Zampini 
2771fcdce8c4SStefano Zampini   C->nonzerostate++;
2772fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2773fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2774fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2775fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2776fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2777fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2778fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2779abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2780fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2781fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2782fcdce8c4SStefano Zampini   }
2783fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2784fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2785fcdce8c4SStefano Zampini }
2786fcdce8c4SStefano Zampini 
/* Forward declaration only: the definition is not in this part of the file
   (presumably it lives with the host SeqAIJ/SeqDense product code -- TODO confirm).
   Takes the product matrix and returns a PetscErrorCode. */
2787fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2788fcdce8c4SStefano Zampini 
2789fcdce8c4SStefano Zampini /* handles sparse or dense B */
2790fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2791fcdce8c4SStefano Zampini {
2792fcdce8c4SStefano Zampini   Mat_Product    *product = mat->product;
2793fcdce8c4SStefano Zampini   PetscErrorCode ierr;
2794fcdce8c4SStefano Zampini   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2795fcdce8c4SStefano Zampini 
2796fcdce8c4SStefano Zampini   PetscFunctionBegin;
2797fcdce8c4SStefano Zampini   MatCheckProduct(mat,1);
2798fcdce8c4SStefano Zampini   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2799abb89eb1SStefano Zampini   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2800fcdce8c4SStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2801fcdce8c4SStefano Zampini   }
2802fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
2803fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
2804fcdce8c4SStefano Zampini     if (!product->C->boundtocpu) {
2805fcdce8c4SStefano Zampini       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2806fcdce8c4SStefano Zampini     }
2807fcdce8c4SStefano Zampini   }
280865e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
280965e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
281065e4b4d4SStefano Zampini     switch (product->type) {
281165e4b4d4SStefano Zampini     case MATPRODUCT_AB:
281265e4b4d4SStefano Zampini       if (product->api_user) {
281365e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
281465e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
281565e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
281665e4b4d4SStefano Zampini       } else {
281765e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
281865e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_ab_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
281965e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
282065e4b4d4SStefano Zampini       }
282165e4b4d4SStefano Zampini       break;
282265e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
282365e4b4d4SStefano Zampini       if (product->api_user) {
282465e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
282565e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
282665e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
282765e4b4d4SStefano Zampini       } else {
282865e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
282965e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_atb_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
283065e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
283165e4b4d4SStefano Zampini       }
283265e4b4d4SStefano Zampini       break;
283365e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
283465e4b4d4SStefano Zampini       if (product->api_user) {
283565e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
283665e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
283765e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
283865e4b4d4SStefano Zampini       } else {
283965e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
284065e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_ptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL);CHKERRQ(ierr);
284165e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
284265e4b4d4SStefano Zampini       }
284365e4b4d4SStefano Zampini       break;
284465e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
284565e4b4d4SStefano Zampini       if (product->api_user) {
284665e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
284765e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
284865e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
284965e4b4d4SStefano Zampini       } else {
285065e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
285165e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_rart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL);CHKERRQ(ierr);
285265e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
285365e4b4d4SStefano Zampini       }
285465e4b4d4SStefano Zampini       break;
285565e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
285665e4b4d4SStefano Zampini       if (product->api_user) {
285765e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
285865e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
285965e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
286065e4b4d4SStefano Zampini       } else {
286165e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
286265e4b4d4SStefano Zampini         ierr = PetscOptionsBool("-matproduct_abc_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL);CHKERRQ(ierr);
286365e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
286465e4b4d4SStefano Zampini       }
286565e4b4d4SStefano Zampini       break;
286665e4b4d4SStefano Zampini     default:
286765e4b4d4SStefano Zampini       break;
286865e4b4d4SStefano Zampini     }
286965e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
287065e4b4d4SStefano Zampini   }
287165e4b4d4SStefano Zampini   /* dispatch */
2872fcdce8c4SStefano Zampini   if (isdense) {
2873ccdfe979SStefano Zampini     switch (product->type) {
2874ccdfe979SStefano Zampini     case MATPRODUCT_AB:
2875ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
2876ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
2877ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
2878ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
2879fcdce8c4SStefano Zampini      if (product->A->boundtocpu) {
2880fcdce8c4SStefano Zampini         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2881fcdce8c4SStefano Zampini       } else {
2882fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2883fcdce8c4SStefano Zampini       }
2884fcdce8c4SStefano Zampini       break;
2885fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2886fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2887fcdce8c4SStefano Zampini       break;
2888ccdfe979SStefano Zampini     default:
2889ccdfe979SStefano Zampini       break;
2890ccdfe979SStefano Zampini     }
2891fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
2892fcdce8c4SStefano Zampini     switch (product->type) {
2893fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
2894fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
2895fcdce8c4SStefano Zampini     case MATPRODUCT_ABt:
2896fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2897fcdce8c4SStefano Zampini       break;
2898fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
2899fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
2900fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2901fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2902fcdce8c4SStefano Zampini       break;
2903fcdce8c4SStefano Zampini     default:
2904fcdce8c4SStefano Zampini       break;
2905fcdce8c4SStefano Zampini     }
2906fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
2907fcdce8c4SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2908fcdce8c4SStefano Zampini   }
2909ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2910ccdfe979SStefano Zampini }
2911ccdfe979SStefano Zampini 
/* MatMult_SeqAIJCUSPARSE - yy = A*xx. Forwards to the shared multiply-add kernel with no vector to add (NULL) and no transposition. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2920e6e9a74fSStefano Zampini 
/* MatMultAdd_SeqAIJCUSPARSE - zz = A*xx + yy. Forwards to the shared multiply-add kernel without transposition. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2929e6e9a74fSStefano Zampini 
/* MatMultHermitianTranspose_SeqAIJCUSPARSE - yy = A^H*xx. Forwards to the shared multiply-add kernel with both the transpose and Hermitian flags set and no vector to add. */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2938e6e9a74fSStefano Zampini 
/* MatMultHermitianTransposeAdd_SeqAIJCUSPARSE - zz = A^H*xx + yy. Forwards to the shared multiply-add kernel with both the transpose and Hermitian flags set. */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(err);
  PetscFunctionReturn(0);
}
29479ae82921SPaul Mullowney 
/* MatMultTranspose_SeqAIJCUSPARSE - yy = A^T*xx. Forwards to the shared multiply-add kernel with the transpose flag set, the Hermitian flag clear, and no vector to add. */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(err);
  PetscFunctionReturn(0);
}
2956ca45077fSPaul Mullowney 
/*
   ScatterAdd - device kernel computing y[idx[i]] += x[i] for 0 <= i < n.

   Launch layout: 1D grid of 1D blocks, one thread per entry i; threads with i >= n do nothing,
   so the grid only needs to cover n (the caller in this file uses (n+255)/256 blocks of 256 threads).

   The update is a plain (non-atomic) read-modify-write, so the entries of idx[] are presumably
   distinct -- NOTE(review): confirm against how cprowIndices is built.

   The global index is formed in PetscInt arithmetic: blockIdx.x*blockDim.x is otherwise evaluated in
   32-bit unsigned arithmetic and could wrap when PetscInt is 64-bit and n is very large.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2962a0e72f99SJunchao Zhang 
/*
   MatMultAddKernel_SeqAIJCUSPARSE - z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Input Parameters:
+  A     - the matrix
.  xx    - the vector x
.  yy    - the vector y to add, or NULL for a plain product z = op(A) x
.  trans - PETSC_TRUE to apply the (conjugate) transpose of A
-  herm  - PETSC_TRUE (together with trans) to apply the conjugate transpose; herm without trans is an error (PETSC_ERR_GPU)

   Output Parameter:
.  zz - the result z; it may be the same vector as yy

   Notes:
   If the matrix has no nonzero rows (a->nonzerorowcnt is zero) the result is just y (or zero when yy is NULL).

   When the matrix uses compressed rows (matstruct->cprowIndices is set) cusparsestruct->workVector holds the
   short vector: the product A x in the non-transposed case, the gathered entries of x in the transposed case.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* No nonzero rows: op(A) x is zero, so z = y (or z = 0 when there is no y) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* Let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* Use the explicitly stored transpose (built on demand) with a non-transposed SpMV */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so the SpMV can accumulate directly into z. That is only valid when z already
         holds y, i.e. when yy == zz. When yy is a different vector, zarray was obtained write-only and its content
         is undefined, so we must use beta = 0 here and add yy afterwards (done below with VecAXPY).
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used as an index into matstruct->cuSpMV[], so make sure it is in the expected range */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
        /* A kernel launch returns no status; pick up launch-configuration errors explicitly */
        {cudaError_t cerr = cudaPeekAtLastError();CHKERRCUDA(cerr);}
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* Transposed product: zz holds A^T xx (+ yy already when yy == zz); add a distinct yy now */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* Log 2*nz flops for the multiply-add; without a y to add, one addition per nonzero row is saved */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
31549ae82921SPaul Mullowney 
/* MatMultTransposeAdd_SeqAIJCUSPARSE - zz = A^T*xx + yy. Forwards to the shared multiply-add kernel with the transpose flag set and the Hermitian flag clear. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  err;

  PetscFunctionBegin;
  err = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(err);
  PetscFunctionReturn(0);
}
3163ca45077fSPaul Mullowney 
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - Finishes assembly through MatAssemblyEnd_SeqAIJ() and, if that changed
   A->nonzerostate while a deviceMat is present, frees the deviceMat with cudaFree() and resets the pointer.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     err;
  cudaError_t        cuerr;
  PetscObjectState   stateBefore;
  Mat_SeqAIJCUSPARSE *gpu;

  PetscFunctionBegin;
  /* Snapshot taken before the host assembly so that a change can be detected afterwards */
  stateBefore = A->nonzerostate;
  gpu         = (Mat_SeqAIJCUSPARSE*)A->spptr;
  err = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(err);
  if (stateBefore == A->nonzerostate || !gpu->deviceMat) {
    /* Nonzero state unchanged, or nothing on the device to discard */
    PetscFunctionReturn(0);
  }
  err   = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(err);
  cuerr = cudaFree(gpu->deviceMat);CHKERRCUDA(cuerr);
  gpu->deviceMat = NULL;
  PetscFunctionReturn(0);
}
31819ae82921SPaul Mullowney 
/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* Create the matrix object; local and global sizes are both m x n */
  err = MatCreate(comm,A);CHKERRQ(err);
  err = MatSetSizes(*A,m,n,m,n);CHKERRQ(err);
  /* Select the CUSPARSE type, then preallocate through the SeqAIJ routine */
  err = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(err);
  err = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(err);
  PetscFunctionReturn(0);
}
32419ae82921SPaul Mullowney 
/*
   MatDestroy_SeqAIJCUSPARSE - Destroys the structure hanging off A->spptr (the multiply data for an
   unfactored matrix, the triangular-factor data otherwise), removes the functions composed on the
   object under the names listed below, and finally calls MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* Names whose composed function is reset to NULL, in this order */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };
  PetscErrorCode err;
  size_t         k;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    err = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(err);
  } else {
    err = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(err);
  }
  for (k=0; k<sizeof(composed)/sizeof(composed[0]); k++) {
    err = PetscObjectComposeFunction((PetscObject)A,composed[k],NULL);CHKERRQ(err);
  }
  err = MatDestroy_SeqAIJ(A);CHKERRQ(err);
  PetscFunctionReturn(0);
}
32649ae82921SPaul Mullowney 
3265ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
326695639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
   MatDuplicate_SeqAIJCUSPARSE - Duplicates A with the SeqAIJ routine and then converts
   the copy in place to MATSEQAIJCUSPARSE.

   Input Parameters:
+  A        - the matrix to duplicate
-  cpvalues - duplication option, forwarded unchanged to MatDuplicate_SeqAIJ()

   Output Parameter:
.  B - the duplicated matrix
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* build the copy with the SeqAIJ implementation first ... */
  err = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(err);
  /* ... then retype it in place */
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
32769ff858a8SKarl Rupp 
/*
   MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y.

   Dispatch, in order:
+  X and Y use different axpy implementations - defer to MatAXPY_SeqAIJ()
.  SAME_NONZERO_PATTERN (given by the caller, or detected by comparing the CSR row offsets
   and column indices of X and Y) - cublasXaxpy on the two value arrays
.  SUBSET_NONZERO_PATTERN - cusparse_csr_spgeam, writing the result into Y's existing CSR arrays
-  anything else - defer to MatAXPY_SeqAIJ()

   Both fallbacks call MatSeqAIJCUSPARSEInvalidateTranspose() on Y first.
   The cuBLAS/cuSPARSE paths require MAT_CUSPARSE_CSR storage for both matrices.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *xaij  = (Mat_SeqAIJ*)X->data;
  Mat_SeqAIJ         *yaij  = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *xcusp = (Mat_SeqAIJCUSPARSE*)X->spptr;
  Mat_SeqAIJCUSPARSE *ycusp = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  CsrMatrix          *xcsr,*ycsr;
  const PetscScalar  *xvals;
  PetscScalar        *yvals;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (ycusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (xcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix*)ycusp->mat->mat;
  xcsr = (CsrMatrix*)xcusp->mat->mat;

  /* see if we can turn this into a cublas axpy: equal nonzero counts and identical index arrays */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    const bool samepattern = thrust::equal(thrust::device,ycsr->row_offsets->begin(),ycsr->row_offsets->end(),xcsr->row_offsets->begin())
                          && thrust::equal(thrust::device,ycsr->column_indices->begin(),ycsr->column_indices->end(),xcsr->column_indices->begin());

    if (samepattern) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str != SUBSET_NONZERO_PATTERN && str != SAME_NONZERO_PATTERN) {
    /* general pattern: handled by the SeqAIJ implementation */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* both remaining paths work directly on the value arrays of X (read) and Y (read/write) */
  ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
  if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(xaij->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,xvals,one,yvals,one);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  } else { /* SUBSET_NONZERO_PATTERN */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    /* the scalars a and b are passed by host address, so switch the handle to host pointer mode */
    stat = cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(ycusp->handle,Y->rmap->n,Y->cmap->n,
                                          &a,xcusp->mat->descr,xaij->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                                          &b,ycusp->mat->descr,yaij->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                             ycusp->mat->descr,         yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
#endif
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam(ycusp->handle,Y->rmap->n,Y->cmap->n,
                               &a,xcusp->mat->descr,xaij->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                               &b,ycusp->mat->descr,yaij->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                  ycusp->mat->descr,         yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
#else
    stat = cusparse_csr_spgeam(ycusp->handle,Y->rmap->n,Y->cmap->n,
                               &a,xcusp->mat->descr,xaij->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                               &b,ycusp->mat->descr,yaij->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                  ycusp->mat->descr,         yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    ierr = PetscLogGpuFlops(xaij->nz + yaij->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#endif
    stat = cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
337495639643SRichard Tran Mills 
/*
   MatScale_SeqAIJCUSPARSE - Scales the stored values of Y by a.

   The value array is obtained with MatSeqAIJCUSPARSEGetArray() and scaled with one
   cublasXscal call over y->nz entries; the cached diagonal information of Y is
   invalidated afterwards.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscBLASInt   stride = 1, nvals = 1;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr  = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr  = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  ierr  = PetscBLASIntCast(aij->nz,&nvals);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,nvals,&a,vals,stride);CHKERRCUBLAS(cberr);
  ierr  = PetscLogGpuFlops(nvals);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
339633c9ba73SStefano Zampini 
/*
   MatZeroEntries_SeqAIJCUSPARSE - Zeroes the stored values of A.

   For an unfactored matrix the values of the cuSPARSE CSR structure (and of its
   transpose, when present) are filled with zeros through thrust. The Mat_SeqAIJ value
   array a->a is always zeroed as well. The offload mask becomes PETSC_OFFLOAD_BOTH when
   the CSR values were zeroed and PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij      = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuzeroed = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

      if (csr->values) {
        gpuzeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;

      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the a->i[nrows] entries of the Mat_SeqAIJ value array */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = gpuzeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
34273fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - Switches the operation table and composed functions of A
   between the SeqAIJ implementations (flg true) and the SeqAIJCUSPARSE ones (flg false).

   Does nothing for factored matrices. When binding to the CPU, MatSeqAIJCUSPARSECopyFromGPU()
   is called before the operations are swapped. On exit A->boundtocpu and the inode usage
   flag are both set to flg.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (!flg) {
    /* install the SeqAIJCUSPARSE implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* copy the data back before handing the matrix over to the SeqAIJ implementations */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }
  A->boundtocpu  = flg;
  aij->inode.use = flg;
  PetscFunctionReturn(0);
}
3478a587d139SMark 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - Converts a SeqAIJ matrix to MATSEQAIJCUSPARSE.

   Input Parameters:
+  A     - the matrix to convert
.  mtype - requested type (not read by this routine)
-  reuse - MAT_INITIAL_MATRIX duplicates A into newmat, MAT_REUSE_MATRIX copies A into the
           existing *newmat, any other value operates on *newmat as passed in

   Output Parameter:
.  newmat - the converted matrix

   The routine sets the default vector type to VECCUDA, allocates the cuSPARSE side structure
   (with its own handle on PetscDefaultCudaStream) when none exists yet, installs the
   SeqAIJCUSPARSE operations and renames the object.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* nothing to create or copy: *newmat is used as given */
    break;
  }
  B = *newmat;

  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      cusp->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      cusp->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = cusp;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      ierr = PetscNew(&factors);CHKERRQ(ierr);
      stat = cusparseCreate(&factors->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(factors->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = factors;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that do not depend on the CPU/GPU binding */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the remaining SeqAIJCUSPARSE operations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
35399ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - Constructor for MATSEQAIJCUSPARSE: sets B up as a SeqAIJ
   matrix and then converts it in place to the cuSPARSE type.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* SeqAIJ setup first ... */
  err = MatCreate_SeqAIJ(B);CHKERRQ(err);
  /* ... followed by the in-place conversion */
  err = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(err);
  PetscFunctionReturn(0);
}
354902fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are
   not supported with CUDA 11.0 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
35667f756511SDominic Meiser 
3567bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
35680f39cd5aSBarry Smith 
/*
   MatSolverTypeRegister_CUSPARSE - Registers the cuSPARSE-based factorization back ends:
   the band LU solver for MATSEQAIJ, and LU, Cholesky, ILU and ICC for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const int           nftypes  = (int)(sizeof(ftypes)/sizeof(ftypes[0]));
  PetscErrorCode      ierr;
  int                 k;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  /* every MATSOLVERCUSPARSE factorization type goes through the same factory routine */
  for (k = 0; k < nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
358229b38603SBarry Smith 
/*
   MatSeqAIJCUSPARSE_Destroy - Releases a Mat_SeqAIJCUSPARSE structure: the mult structures
   for the matrix and its transpose, the auxiliary vectors, the cuSPARSE handle, and finally
   the structure itself through PetscFree(). A NULL *cusparsestruct is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  cusparseStatus_t   stat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) {
    PetscFunctionReturn(0);
  }
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;
  if (cusp->handle) {
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
  }
  /* free through the caller's pointer */
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
36027f756511SDominic Meiser 
/*
   CsrMatrix_Destroy - Deletes the three arrays of a CsrMatrix and the CsrMatrix itself,
   then sets *mat to NULL. A NULL *mat is accepted and ignored.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) {
    PetscFunctionReturn(0);
  }
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
36157f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy - Overload for a triangular factor: destroys the matrix
   descriptor, the solve analysis info, the CSR matrix and the attached buffers, then frees
   the structure with PetscFree(). A NULL *trifactor is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!tf) {
    PetscFunctionReturn(0);
  }
  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    /* released with cudaFreeHost, unlike the other buffers */
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  /* free through the caller's pointer */
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
36357f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy - Overload for a mult structure: destroys the stored
   matrix (CSR, or ELL/HYB before CUDA 11), the matrix descriptor, the compressed-row indices,
   the three scalar constants and, from CUDA 11 on, the generic SpMat descriptor and the
   per-operation SpMV data. The structure is then deleted and *matstruct set to NULL.

   Input Parameters:
+  matstruct - address of the structure pointer; a NULL *matstruct is accepted and ignored
-  format    - storage format of (*matstruct)->mat, selects how it is destroyed

   Errors with PETSC_ERR_SUP for MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB when built with CUDA >= 11.0.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* propagate the error code like every other call in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one SpMV slot per entry of cuSPARSE's 3-entry cuSpMV array; only initialized slots own resources */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
36797f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSETriFactors_Reset - Releases everything held by a triangular-factors
   structure while keeping the structure itself (and its cuSPARSE handle) alive for reuse.

   Input Parameter:
.  trifactors - address of the structure pointer; a NULL *trifactors is accepted and ignored

   Every released member is reset to NULL so that the routine can safely be called again on
   the same structure (MatSeqAIJCUSPARSETriFactors_Destroy() calls it once more).
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {
      cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);
      (*trifactors)->a_band_d = NULL; /* avoid a double free on the next reset */
    }
    if ((*trifactors)->i_band_d) {
      cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);
      (*trifactors)->i_band_d = NULL; /* avoid a double free on the next reset */
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3702ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets a triangular-factor container, destroys its cuSPARSE
  handle (when one is set) and frees the container with PetscFree().

  Input Parameter:
. trifactors - address of the pointer to the container; nothing is done when *trifactors is NULL
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  if ((*trifactors)->handle) {
    cusparseStatus_t stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
37197e8381f9SStefano Zampini 
/* Lexicographic "less than" on pairs of indices: compares the first components and, when they are equal,
   the second ones. The free function thrust::get<N>() is used instead of the tuple member get<N>(),
   since the member form is not available with every Thrust version. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
37307e8381f9SStefano Zampini 
/* Equality on pairs of indices: true only when both components match. The free function
   thrust::get<N>() is used instead of the tuple member get<N>(), since the member form is not
   available with every Thrust version. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return thrust::get<0>(t1) == thrust::get<0>(t2) && thrust::get<1>(t1) == thrust::get<1>(t2);
  }
};
37407e8381f9SStefano Zampini 
/* Binary functor flagging a change between two indices: yields 1 when they differ, 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
37497e8381f9SStefano Zampini 
/* Binary functor combining two flags with a logical OR: yields 1 when at least one argument is nonzero, 0 otherwise */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
37587e8381f9SStefano Zampini 
37597e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the values of the device CSR matrix from an array of values given
  in the ordering recorded in cusp->cooPerm (and cusp->cooPerm_a when there are repeated entries).

  Input Parameters:
+ A     - the matrix
. v     - the values, either in host or in device memory (checked with isCudaMem()); may be NULL
- imode - ADD_VALUES adds to the values currently stored, any other mode overwrites them

  Notes:
  When no permutation has been recorded (cusp->cooPerm is NULL) the routine only calls
  MatAssemblyBegin()/MatAssemblyEnd() and returns.
  When v is NULL the stored values are zeroed for INSERT_VALUES and left untouched otherwise.
  Host values are first copied into a temporary device array; that array and the work array used for
  ADD_VALUES with repeated entries are automatic objects, so they are released on every exit path,
  including early error returns.
  On exit the matrix is flagged as assembled, with up-to-date data on the GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    /* no values provided: inserting means zeroing the matrix, adding is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
  } else {
    const PetscInt                        n = cusp->cooPerm->size();
    THRUSTARRAY                           cooPerm_v; /* device copy of v[], filled only when v[] lives on the host */
    thrust::device_ptr<const PetscScalar> d_v;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      cooPerm_v.assign(v,v+n);
      d_v = cooPerm_v.data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
      if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them */
        THRUSTARRAY cooPerm_w(matrix->values->size());
        auto        vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
          cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
          cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
        */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(cooPerm_w.begin(),cooPerm_w.end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      } else {
        /* all nonzeros in d_v[] are unique entries */
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
      }
    } else {
      if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
        auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      } else {
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAEquals());
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
38417e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - flags the cached transpose of a MATSEQAIJCUSPARSE matrix as out of date.

  Input Parameters:
+ A       - the matrix
- destroy - when PETSC_TRUE the transpose structure and the csr2csc_i index array are also freed

  Notes:
  Nothing is done when the matrix has no CUSPARSE data attached (A->spptr is NULL).
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (spptr) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&spptr->matTranspose,spptr->format);CHKERRQ(ierr);
      delete spptr->csr2csc_i;
      spptr->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3858a49f1ed0SStefano Zampini 
38597e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR nonzero structure of the matrix from a list of
  (row, column) pairs and records the permutation needed to later set values given in the same ordering.

  Input Parameters:
+ A     - the matrix
. n     - number of (row, column) pairs
. coo_i - row indices, host array of length n
- coo_j - column indices, host array of length n

  Notes:
  The pairs are copied to the device, sorted by row and then by column, and made unique there.
  cusp->cooPerm stores the sorting permutation; cusp->cooPerm_a is kept only when there are repeated
  pairs and then maps each sorted entry to the index of its unique nonzero.
  The host CSR arrays of the matrix are reallocated and filled from the device result, the values are
  zeroed and copied to the GPU, and the matrix is left unassembled with
  MAT_NEW_NONZERO_ALLOCATION_ERR set to PETSC_TRUE.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* the number of entries changed: the stored permutations cannot be reused */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
      d_j     = [2,2,3,5,6,x]
      nekey points at the x entries (new logical end), ekey is still the end of the full arrays
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    /* thrust::get<0>() is used on the iterator tuple instead of the member get<0>(), which is not available with every Thrust version */
    thrust::upper_bound(d_i.begin(), thrust::get<0>(nekey.get_iterator_tuple()), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count the rows with at least one nonzero */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3977ed502f03SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers (device memory); pass NULL if not needed
-   j - the CSR column indices (device memory); pass NULL if not needed

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      Each output is optional: the routine returns immediately only when both i and j are NULL.

      May trigger a host-to-device copy of the matrix, since MatSeqAIJCUSPARSECopyToGPU() is called.

      Only the CSR storage format is supported.

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  if (!i && !j) PetscFunctionReturn(0); /* nothing requested */
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* build the uncompressed row offsets on the device from the host ones, once */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
40265f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - gives back the device row pointers and column indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE, the value used in the matching call to MatSeqAIJCUSPARSEGetIJ() (not used by this routine)

    Output Parameters:
+   i - the CSR row pointers, set to NULL on return (may itself be NULL)
-   j - the CSR column indices, set to NULL on return (may itself be NULL)

    Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (i) {
    *i = NULL;
  }
  if (j) {
    *j = NULL;
  }
  PetscFunctionReturn(0);
}
40535f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - provides read-only access to the device array holding the values of a MATSEQAIJCUSPARSE matrix

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: a host-to-device copy may take place when the up-to-date matrix data is on the host.
   Only the CSR storage format is supported.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (spptr->format == MAT_CUSPARSE_ELL || spptr->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out the pointer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!spptr->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)spptr->mat->mat;
  if (!mat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = mat->values->data().get();
  PetscFunctionReturn(0);
}
4089ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - gives back the read-only array obtained with MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data, set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the matrix was only read: just invalidate the caller's pointer */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4114ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - provides read-write access to the device array holding the values of a MATSEQAIJCUSPARSE matrix

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: a host-to-device copy may take place when the up-to-date matrix data is on the host.
   The matrix data is then flagged as valid on the GPU and the cached transpose as out of date.
   Only the CSR storage format is supported.

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *mat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (spptr->format == MAT_CUSPARSE_ELL || spptr->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy is current before handing out the pointer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!spptr->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  mat = (CsrMatrix*)spptr->mat->mat;
  if (!mat->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = mat->values->data().get();
  /* the caller may modify the values on the device */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - gives back the read-write array obtained with MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data, set to NULL on return

   Level: developer

   Notes: the object state of the matrix is increased, since its values may have been modified

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* record that the matrix may have changed before invalidating the caller's pointer */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4179039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU. Only the CSR storage format is supported;
   the ELL and HYB formats generate an error. The transpose cached with the matrix, if any, is invalidated.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only after A has been validated as a MATSEQAIJCUSPARSE matrix */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR container itself as well as its value array before dereferencing */
  if (!csr || !csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* write-only access: no copy to the GPU is performed, the device copy is simply flagged as the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4216ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - gives back the write-only device array previously obtained with MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; it is set to NULL on successful return

   Level: developer

   Notes: the object state of A is increased by this call

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscObject    obj = (PetscObject)A; /* plain cast, A is not dereferenced before it is validated below */
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the state is bumped first: if that fails, the caller's pointer is left untouched */
  ierr = PetscObjectStateIncrease(obj);CHKERRQ(ierr);
  *a   = NULL;
  PetscFunctionReturn(0);
}
4244ed502f03SStefano Zampini 
/*
   Comparison functor for 4-tuples whose first two entries are used as a (row, column) pair:
   returns true when t1 comes strictly before t2 in lexicographic (first, second) order.
   The third and fourth tuple entries do not take part in the comparison.
   The call operator is const since the functor carries no state.
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
4255ed502f03SStefano Zampini 
/*
   Unary functor adding a fixed integer offset to its argument.
   The call operator is const: it only reads the stored offset, so the functor can be
   invoked through const objects and const iterator adaptors.
*/
struct Shift
{
  int _shift; /* offset added to every input value */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
4267ed502f03SStefano Zampini 
/*
   MatSeqAIJCUSPARSEMergeMats - merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
   [A';B']' operation in matlab notation, i.e. row i of C is row i of A followed by row i of B,
   with the column indices of B shifted by the number of columns of A.

   Input Parameters:
+  A     - first MATSEQAIJCUSPARSE matrix (CSR format only)
.  B     - second MATSEQAIJCUSPARSE matrix (CSR format only), with the same number of local rows as A
-  reuse - MAT_INITIAL_MATRIX to create C, any other value (except MAT_INPLACE_MATRIX, which is not
           supported) to update the values of a C obtained from a previous call

   Output Parameter:
.  C     - the merged matrix

   Notes:
   With MAT_INITIAL_MATRIX the merge is performed on the GPU on the COO representation of A and B; the
   resulting row offsets and column indices are then copied to the host. The positions in the values
   array of C of the entries of A, followed by those of B, are stored in the cooPerm array of C.
   In the reuse case only the values are updated, scattering the values of A and B through cooPerm.
   In both cases the values of C are valid on the GPU only (offloadmask is PETSC_OFFLOAD_GPU).
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* the implementation data of A and B are accessed only after both matrices have been validated */
  a     = (Mat_SeqAIJ*)A->data;
  b     = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and its (empty) GPU data structures */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      { /* scope of the temporary COO work arrays: they are released when the scope is left, including on error returns */
        THRUSTINTARRAY32 Acoo(Annz);
        THRUSTINTARRAY32 Bcoo(Bnnz);
        THRUSTINTARRAY32 Ccoo(c->nz);
        THRUSTINTARRAY32 *Aroff,*Broff;

        if (a->compressedrow.use) { /* need full row offset */
          if (!Acusp->rowoffsets_gpu) {
            Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
            Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
            ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
          }
          Aroff = Acusp->rowoffsets_gpu;
        } else Aroff = Acsr->row_offsets;
        if (b->compressedrow.use) { /* need full row offset */
          if (!Bcusp->rowoffsets_gpu) {
            Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
            Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
            ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
          }
          Broff = Bcusp->rowoffsets_gpu;
        } else Broff = Bcsr->row_offsets;
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* expand the row offsets of A and B into explicit row indices (COO format) */
        stat = cusparseXcsr2coo(Acusp->handle,
                                Aroff->data().get(),
                                Annz,
                                m,
                                Acoo.data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseXcsr2coo(Bcusp->handle,
                                Broff->data().get(),
                                Bnnz,
                                m,
                                Bcoo.data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
        /* tag carried through the merge: 1 for entries coming from A, 0 for entries coming from B */
        auto Aperm = thrust::make_constant_iterator(1);
        auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
        /* column indices of B shifted on the fly by the number of columns of A */
        auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
        auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
        /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
        auto Bcib = Bcsr->column_indices->begin();
        auto Bcie = Bcsr->column_indices->end();
        thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
        THRUSTINTARRAY32 wPerm(Annz+Bnnz);
        auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
        auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
        auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm));
        auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm));
        auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin()));
        auto p1 = Ccusp->cooPerm->begin();
        auto p2 = Ccusp->cooPerm->begin();
        thrust::advance(p2,Annz);
        /* merge the (row,column,value,tag) tuples of A and B into C, ordered by (row,column) */
        PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
        /* undo the in-place shift of the column indices of B */
        thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
        auto cci = thrust::make_counting_iterator(zero);
        auto cce = thrust::make_counting_iterator(c->nz);
        /* cooPerm: positions in C of the entries tagged as coming from A (first Annz slots), then of those coming from B */
#if 0 //Errors on SUMMIT cuda 11.1.0
        PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm.begin(),p1,p2,thrust::identity<int>()));
#else
        auto pred = thrust::identity<int>();
        PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm.begin(),p1,pred));
        PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm.begin(),p2,pred));
#endif
        /* compress the merged row indices into the row offsets of C */
        stat = cusparseXcoo2csr(Ccusp->handle,
                                Ccoo.data().get(),
                                c->nz,
                                m,
                                Ccsr->row_offsets->data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
        ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated = PETSC_TRUE;
        Ccusp->rowoffsets_gpu = NULL;
        CmatT->cprowIndices = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* the rows of the transpose of C are the rows of the transpose of A followed by those of the transpose of B */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* the last offset copied from A is overwritten by the first (shifted) offset of B */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* host side: copy the nonzero pattern from the GPU and fill the remaining Mat_SeqAIJ fields */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* reuse: *C is dereferenced and its spptr is cast below, so validate it first */
    PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
    PetscCheckTypeName(*C,MATSEQAIJCUSPARSE);
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* cooPerm holds the positions in C of the entries of A, followed by those of the entries of B */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter the values of A into C */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter the values of B into C */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* the values of the transpose of C are those of the transpose of A followed by those of the transpose of B */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4571c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies selected entries of the device value array of A into v.

  Input Parameters:
+ A   - matrix whose value array is obtained with MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of values to copy
- idx - positions (into the value array) of the entries to gather; when idx is NULL
        (or n is 0) the first n values are copied contiguously instead

  Output Parameter:
. v - destination for the n values; isCudaMem() is used to decide whether it is
      device memory (written directly) or host memory (filled via a device-to-host copy)

  Notes:
  idx is copied into a device array before the gather and is accounted for as a
  CPU-to-GPU transfer, i.e. it is treated as host memory.

  Early returns triggered by CHKERRQ()/CHKERRCUDA() do not leak device memory: all
  temporary buffers are automatic thrust vectors.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* Gather path: move the index list to the device first */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* Device staging buffer, only sized when v lives on the host. It is an automatic
       object so that it is released on every exit path, including error returns */
    THRUSTARRAY w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* Pair av[idx[i]] with dv[i]; VecCUDAEquals (defined elsewhere) is expected to
       assign the first tuple element to the second */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      /* Bring the gathered values back to the host destination */
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* No index list (or nothing to copy): plain contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* When v is host memory, n scalars were transferred from the GPU to the CPU */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4611