19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 18a2cee5feSJed Brown #include <thrust/remove.h> 19a2cee5feSJed Brown #include <thrust/sort.h> 20a2cee5feSJed Brown #include <thrust/unique.h> 21e8d2b73aSMark Adams 22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 26afb2bd1cSJunchao Zhang 27afb2bd1cSJunchao Zhang typedef enum { 28afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 29afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 30afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 31afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 32afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 33afb2bd1cSJunchao Zhang 34afb2bd1cSJunchao Zhang typedef enum { 35afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 42afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 43afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 47afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 48afb2bd1cSJunchao Zhang 49afb2bd1cSJunchao Zhang typedef enum { 50afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 53afb2bd1cSJunchao Zhang */ 54afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 56afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57afb2bd1cSJunchao Zhang #endif 589ae82921SPaul Mullowney 59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62087f3262SPaul Mullowney 636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 819ae82921SPaul Mullowney 827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 877f756511SDominic Meiser 8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 9057181aedSStefano Zampini 91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]); 93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 94c215019aSStefano Zampini 95bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 96ca45077fSPaul Mullowney { 97aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 986e111a19SKarl Rupp 99ca45077fSPaul Mullowney PetscFunctionBegin; 100ca45077fSPaul Mullowney switch (op) { 101e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 102aa372e3fSPaul Mullowney cusparsestruct->format = format; 103ca45077fSPaul Mullowney break; 104e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 105aa372e3fSPaul Mullowney cusparsestruct->format = format; 106ca45077fSPaul Mullowney break; 107ca45077fSPaul Mullowney default: 10898921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 109ca45077fSPaul Mullowney } 110ca45077fSPaul Mullowney PetscFunctionReturn(0); 111ca45077fSPaul Mullowney } 1129ae82921SPaul Mullowney 113e057df02SPaul Mullowney /*@ 114e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 115e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 116aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 117e057df02SPaul Mullowney Not Collective 118e057df02SPaul Mullowney 119e057df02SPaul Mullowney Input Parameters: 1208468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 12136d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1222692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 123e057df02SPaul Mullowney 124e057df02SPaul Mullowney Output Parameter: 125e057df02SPaul Mullowney 126e057df02SPaul Mullowney Level: intermediate 127e057df02SPaul Mullowney 128db781477SPatrick Sanan .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 129e057df02SPaul Mullowney @*/ 130e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 131e057df02SPaul Mullowney { 132e057df02SPaul Mullowney PetscFunctionBegin; 133e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 134cac4c232SBarry Smith PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)); 135e057df02SPaul Mullowney PetscFunctionReturn(0); 136e057df02SPaul Mullowney } 137e057df02SPaul Mullowney 138365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 139365b711fSMark Adams { 140365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 141365b711fSMark Adams 142365b711fSMark Adams PetscFunctionBegin; 143365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 144365b711fSMark Adams PetscFunctionReturn(0); 145365b711fSMark Adams } 146365b711fSMark Adams 147365b711fSMark Adams /*@ 148365b711fSMark Adams MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 149365b711fSMark Adams 150365b711fSMark Adams Input Parameters: 151365b711fSMark Adams + A - Matrix of type SEQAIJCUSPARSE 152365b711fSMark Adams - use_cpu - set flag for using the built-in CPU MatSolve 153365b711fSMark Adams 154365b711fSMark Adams Output Parameter: 155365b711fSMark Adams 156365b711fSMark Adams Notes: 157365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 158365b711fSMark Adams and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 159365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 160365b711fSMark Adams 161365b711fSMark Adams Level: intermediate 162365b711fSMark Adams 163db781477SPatrick Sanan .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 164365b711fSMark Adams @*/ 165365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 166365b711fSMark Adams { 167365b711fSMark Adams PetscFunctionBegin; 168365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID,1); 169cac4c232SBarry Smith PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)); 170365b711fSMark Adams PetscFunctionReturn(0); 171365b711fSMark Adams } 172365b711fSMark Adams 1731a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 174e6e9a74fSStefano Zampini { 175e6e9a74fSStefano Zampini PetscFunctionBegin; 1761a2c6b5cSJunchao Zhang switch (op) { 1771a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 1781a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 1799566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1801a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 1811a2c6b5cSJunchao Zhang break; 1821a2c6b5cSJunchao Zhang default: 1839566063dSJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 1841a2c6b5cSJunchao Zhang break; 185e6e9a74fSStefano Zampini } 186e6e9a74fSStefano Zampini PetscFunctionReturn(0); 187e6e9a74fSStefano Zampini } 188e6e9a74fSStefano Zampini 189bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 190bddcd29dSMark Adams 191bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 192bddcd29dSMark Adams { 193bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 194bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 195bddcd29dSMark Adams PetscBool row_identity,col_identity; 196365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 197bddcd29dSMark Adams 198bddcd29dSMark Adams PetscFunctionBegin; 1999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2009566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 201bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 202bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 2039566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 2049566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol,&col_identity)); 205f93f8571SJunchao Zhang 206365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 207f93f8571SJunchao Zhang if (row_identity && col_identity) { 208bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 209bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 210bddcd29dSMark Adams } else { 211bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 212bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 213365b711fSMark Adams } 214f93f8571SJunchao Zhang } 215bddcd29dSMark Adams B->ops->matsolve = NULL; 216bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 217bddcd29dSMark Adams 218bddcd29dSMark Adams /* get the triangular factors */ 219365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 2209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 221365b711fSMark Adams } 222bddcd29dSMark Adams PetscFunctionReturn(0); 223bddcd29dSMark Adams } 224bddcd29dSMark Adams 2254416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2269ae82921SPaul Mullowney { 227e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2289ae82921SPaul Mullowney PetscBool flg; 229a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2306e111a19SKarl Rupp 2319ae82921SPaul Mullowney PetscFunctionBegin; 232d0609cedSBarry Smith PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options"); 2339ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 234d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 235d0609cedSBarry Smith "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 2369566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 237afb2bd1cSJunchao Zhang 238d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 239d0609cedSBarry Smith "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 2409566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 2419566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 2429566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 243afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 244d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 245d0609cedSBarry Smith "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg)); 246afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 247ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301 248aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 249a435da06SStefano Zampini #else 250aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 251a435da06SStefano Zampini #endif 252d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 253d0609cedSBarry Smith "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg)); 254aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 255afb2bd1cSJunchao Zhang 256d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 257d0609cedSBarry Smith "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg)); 258aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 259afb2bd1cSJunchao Zhang #endif 2604c87dfd4SPaul Mullowney } 261d0609cedSBarry Smith PetscOptionsHeadEnd(); 2629ae82921SPaul Mullowney PetscFunctionReturn(0); 2639ae82921SPaul Mullowney } 2649ae82921SPaul Mullowney 2656fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2669ae82921SPaul Mullowney { 267da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2689ae82921SPaul Mullowney 2699ae82921SPaul Mullowney PetscFunctionBegin; 2709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2719566063dSJacob Faibussowitsch PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2729ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2739ae82921SPaul Mullowney PetscFunctionReturn(0); 2749ae82921SPaul Mullowney } 2759ae82921SPaul Mullowney 2766fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2779ae82921SPaul Mullowney { 278da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2799ae82921SPaul Mullowney 2809ae82921SPaul Mullowney PetscFunctionBegin; 2819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2829566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2839ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2849ae82921SPaul Mullowney PetscFunctionReturn(0); 2859ae82921SPaul Mullowney } 2869ae82921SPaul Mullowney 287087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 288087f3262SPaul Mullowney { 289da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 290087f3262SPaul Mullowney 291087f3262SPaul Mullowney PetscFunctionBegin; 2929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2939566063dSJacob Faibussowitsch PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info)); 294087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 295087f3262SPaul Mullowney PetscFunctionReturn(0); 296087f3262SPaul Mullowney } 297087f3262SPaul Mullowney 298087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 299087f3262SPaul Mullowney { 300da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 301087f3262SPaul Mullowney 302087f3262SPaul Mullowney PetscFunctionBegin; 3039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 3049566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info)); 305087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 306087f3262SPaul Mullowney PetscFunctionReturn(0); 307087f3262SPaul Mullowney } 308087f3262SPaul Mullowney 309087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3109ae82921SPaul Mullowney { 3119ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3129ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3139ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 314aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3159ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3169ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3179ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3189ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 3199ae82921SPaul Mullowney 3209ae82921SPaul Mullowney PetscFunctionBegin; 321cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 322c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3239ae82921SPaul Mullowney try { 3249ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3259ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 326da79fbbcSStefano Zampini if (!loTriFactor) { 3272cbc15d9SMark PetscScalar *AALo; 3282cbc15d9SMark 3299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar))); 3309ae82921SPaul Mullowney 3319ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 3329566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt))); 3339566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt))); 3349ae82921SPaul Mullowney 3359ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3369ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 3379ae82921SPaul Mullowney AiLo[n] = nzLower; 3389ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 3399ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 3409ae82921SPaul Mullowney v = aa; 3419ae82921SPaul Mullowney vi = aj; 3429ae82921SPaul Mullowney offset = 1; 3439ae82921SPaul Mullowney rowOffset= 1; 3449ae82921SPaul Mullowney for (i=1; i<n; i++) { 3459ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 346e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3479ae82921SPaul Mullowney AiLo[i] = rowOffset; 3489ae82921SPaul Mullowney rowOffset += nz+1; 3499ae82921SPaul Mullowney 3509566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz)); 3519566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AALo[offset]), v, nz)); 3529ae82921SPaul Mullowney 3539ae82921SPaul Mullowney offset += nz; 3549ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 3559ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 3569ae82921SPaul Mullowney offset += 1; 3579ae82921SPaul Mullowney 3589ae82921SPaul Mullowney v += nz; 3599ae82921SPaul Mullowney vi += nz; 3609ae82921SPaul Mullowney } 3612205254eSKarl Rupp 362aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 3639566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 364da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 365aa372e3fSPaul Mullowney /* Create the matrix description */ 3669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 3679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 3681b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 3699566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 370afb2bd1cSJunchao Zhang #else 3719566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 372afb2bd1cSJunchao Zhang #endif 3739566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 3749566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 375aa372e3fSPaul Mullowney 376aa372e3fSPaul Mullowney /* set the operation */ 377aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 378aa372e3fSPaul Mullowney 379aa372e3fSPaul Mullowney /* set the matrix */ 380aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 381aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 382aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 383aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 384aa372e3fSPaul Mullowney 385aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 386aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 387aa372e3fSPaul Mullowney 388aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 389aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 390aa372e3fSPaul Mullowney 391aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 392aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 393aa372e3fSPaul Mullowney 394afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 3959566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 396*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 3971b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 398*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 399afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 400afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 401afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 4025f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 4039566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 404afb2bd1cSJunchao Zhang #endif 405afb2bd1cSJunchao Zhang 406aa372e3fSPaul Mullowney /* perform the solve analysis */ 407*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 408aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 409aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 410d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 4111b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 412d49cd2b7SBarry Smith loTriFactor->solveInfo, 4135f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 414d49cd2b7SBarry Smith #else 4155f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 416afb2bd1cSJunchao Zhang #endif 4179566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 4189566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 419aa372e3fSPaul Mullowney 420da79fbbcSStefano Zampini /* assign the pointer */ 421aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4222cbc15d9SMark loTriFactor->AA_h = AALo; 4239566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiLo)); 4249566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjLo)); 4259566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar))); 426da79fbbcSStefano Zampini } else { /* update values only */ 4272cbc15d9SMark if (!loTriFactor->AA_h) { 4289566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar))); 4292cbc15d9SMark } 430da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4312cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 432da79fbbcSStefano Zampini v = aa; 433da79fbbcSStefano Zampini vi = aj; 434da79fbbcSStefano Zampini offset = 1; 435da79fbbcSStefano Zampini for (i=1; i<n; i++) { 436da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 4379566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 438da79fbbcSStefano Zampini offset += nz; 4392cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 440da79fbbcSStefano Zampini offset += 1; 441da79fbbcSStefano Zampini v += nz; 442da79fbbcSStefano Zampini } 4432cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 4449566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar))); 445da79fbbcSStefano Zampini } 4469ae82921SPaul Mullowney } catch(char *ex) { 44798921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4489ae82921SPaul Mullowney } 4499ae82921SPaul Mullowney } 4509ae82921SPaul Mullowney PetscFunctionReturn(0); 4519ae82921SPaul Mullowney } 4529ae82921SPaul Mullowney 453087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 4549ae82921SPaul Mullowney { 4559ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4569ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4579ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 458aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 4599ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 4609ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 4619ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 4629ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 4639ae82921SPaul Mullowney 4649ae82921SPaul Mullowney PetscFunctionBegin; 465cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 466c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 4679ae82921SPaul Mullowney try { 4689ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 4699ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 470da79fbbcSStefano Zampini if (!upTriFactor) { 4712cbc15d9SMark PetscScalar *AAUp; 4722cbc15d9SMark 4739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 4742cbc15d9SMark 4759ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 4769566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 4779566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 4789ae82921SPaul Mullowney 4799ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 4809ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 4819ae82921SPaul Mullowney AiUp[n]=nzUpper; 4829ae82921SPaul Mullowney offset = nzUpper; 4839ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 4849ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 4859ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 4869ae82921SPaul Mullowney 487e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 4889ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 4899ae82921SPaul Mullowney 490e057df02SPaul Mullowney /* decrement the offset */ 4919ae82921SPaul Mullowney offset -= (nz+1); 4929ae82921SPaul Mullowney 493e057df02SPaul Mullowney /* first, set the diagonal elements */ 4949ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 49509f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 4969ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 4979ae82921SPaul Mullowney 4989566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz)); 4999566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz)); 5009ae82921SPaul Mullowney } 5012205254eSKarl Rupp 502aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 5039566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 504da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5052205254eSKarl Rupp 506aa372e3fSPaul Mullowney /* Create the matrix description */ 5079566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 5089566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 5091b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 5109566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 511afb2bd1cSJunchao Zhang #else 5129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 513afb2bd1cSJunchao Zhang #endif 5149566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 5159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 516aa372e3fSPaul Mullowney 517aa372e3fSPaul Mullowney /* set the operation */ 518aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 519aa372e3fSPaul Mullowney 520aa372e3fSPaul Mullowney /* set the matrix */ 521aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 522aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 523aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 524aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 525aa372e3fSPaul Mullowney 526aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 527aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 528aa372e3fSPaul Mullowney 529aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 530aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 531aa372e3fSPaul Mullowney 532aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 533aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 534aa372e3fSPaul Mullowney 535afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 5369566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 537*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 5381b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 539*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 540afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 541afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 542afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 5435f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 5449566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 545afb2bd1cSJunchao Zhang #endif 546afb2bd1cSJunchao Zhang 547aa372e3fSPaul Mullowney /* perform the solve analysis */ 548*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 549aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 550aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 551d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 5521b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 553d49cd2b7SBarry Smith upTriFactor->solveInfo, 5545f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 555d49cd2b7SBarry Smith #else 5565f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 557afb2bd1cSJunchao Zhang #endif 5589566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 5599566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 560aa372e3fSPaul Mullowney 561da79fbbcSStefano Zampini /* assign the pointer */ 562aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 5632cbc15d9SMark upTriFactor->AA_h = AAUp; 5649566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 5659566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 5669566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar))); 567da79fbbcSStefano Zampini } else { 5682cbc15d9SMark if (!upTriFactor->AA_h) { 5699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar))); 5702cbc15d9SMark } 571da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 572da79fbbcSStefano Zampini offset = nzUpper; 573da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 574da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 575da79fbbcSStefano Zampini 576da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 577da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 578da79fbbcSStefano Zampini 579da79fbbcSStefano Zampini /* decrement the offset */ 580da79fbbcSStefano Zampini offset -= (nz+1); 581da79fbbcSStefano Zampini 582da79fbbcSStefano Zampini /* first, set the diagonal elements */ 5832cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 5849566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz)); 585da79fbbcSStefano Zampini } 5862cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 5879566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar))); 588da79fbbcSStefano Zampini } 5899ae82921SPaul Mullowney } catch(char *ex) { 59098921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5919ae82921SPaul Mullowney } 5929ae82921SPaul Mullowney } 5939ae82921SPaul Mullowney PetscFunctionReturn(0); 5949ae82921SPaul Mullowney } 5959ae82921SPaul Mullowney 596087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 5979ae82921SPaul Mullowney { 5989ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5999ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6009ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6019ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6029ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6039ae82921SPaul Mullowney 6049ae82921SPaul Mullowney PetscFunctionBegin; 60528b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 6069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 6079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 6082205254eSKarl Rupp 609da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 610aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6119ae82921SPaul Mullowney 612c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 613e057df02SPaul Mullowney /* lower triangular indices */ 6149566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 615da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 616da79fbbcSStefano Zampini const PetscInt *r; 617da79fbbcSStefano Zampini 6189566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&r)); 619aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 620aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6219566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&r)); 6229566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 623da79fbbcSStefano Zampini } 6249ae82921SPaul Mullowney 625e057df02SPaul Mullowney /* upper triangular indices */ 6269566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol,&col_identity)); 627da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 628da79fbbcSStefano Zampini const PetscInt *c; 629da79fbbcSStefano Zampini 6309566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&c)); 631aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 632aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6339566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&c)); 6349566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 635da79fbbcSStefano Zampini } 6369ae82921SPaul Mullowney PetscFunctionReturn(0); 6379ae82921SPaul Mullowney } 6389ae82921SPaul Mullowney 639087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 640087f3262SPaul Mullowney { 641087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 642087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 643aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 644aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 645087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 646087f3262SPaul Mullowney PetscScalar *AAUp; 647087f3262SPaul Mullowney PetscScalar *AALo; 648087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 649087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 650087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 651087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 652087f3262SPaul Mullowney 653087f3262SPaul Mullowney PetscFunctionBegin; 654cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 655c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 656087f3262SPaul Mullowney try { 6579566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 6589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 659da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 660087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 6619566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 6629566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 663087f3262SPaul Mullowney 664087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 665087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 666087f3262SPaul Mullowney AiUp[n]=nzUpper; 667087f3262SPaul Mullowney offset = 0; 668087f3262SPaul Mullowney for (i=0; i<n; i++) { 669087f3262SPaul Mullowney /* set the pointers */ 670087f3262SPaul Mullowney v = aa + ai[i]; 671087f3262SPaul Mullowney vj = aj + ai[i]; 672087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 673087f3262SPaul Mullowney 674087f3262SPaul Mullowney /* first, set the diagonal elements */ 675087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 67609f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 677087f3262SPaul Mullowney AiUp[i] = offset; 67809f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 679087f3262SPaul Mullowney 680087f3262SPaul Mullowney offset+=1; 681087f3262SPaul Mullowney if (nz>0) { 6829566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 6839566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 684087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 685087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 686087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 687087f3262SPaul Mullowney } 688087f3262SPaul Mullowney offset+=nz; 689087f3262SPaul Mullowney } 690087f3262SPaul Mullowney } 691087f3262SPaul Mullowney 692aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 6939566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 694da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 695087f3262SPaul Mullowney 696aa372e3fSPaul Mullowney /* Create the matrix description */ 6979566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 6989566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 6991b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 7009566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 701afb2bd1cSJunchao Zhang #else 7029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 703afb2bd1cSJunchao Zhang #endif 7049566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 7059566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 706087f3262SPaul Mullowney 707aa372e3fSPaul Mullowney /* set the matrix */ 708aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 709aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 710aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 711aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 712aa372e3fSPaul Mullowney 713aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 714aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 715aa372e3fSPaul Mullowney 716aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 717aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 718aa372e3fSPaul Mullowney 719aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 720aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 721aa372e3fSPaul Mullowney 722afb2bd1cSJunchao Zhang /* set the operation */ 723afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 724afb2bd1cSJunchao Zhang 725afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 7269566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 727*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 7281b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 729*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 730afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 731afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 732afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 7335f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 7349566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 735afb2bd1cSJunchao Zhang #endif 736afb2bd1cSJunchao Zhang 737aa372e3fSPaul Mullowney /* perform the solve analysis */ 738*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 739aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 740aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 741d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 7421b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 743d49cd2b7SBarry Smith upTriFactor->solveInfo, 7445f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 745d49cd2b7SBarry Smith #else 7465f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 747afb2bd1cSJunchao Zhang #endif 7489566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 7499566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 750aa372e3fSPaul Mullowney 751da79fbbcSStefano Zampini /* assign the pointer */ 752aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 753aa372e3fSPaul Mullowney 754aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 7559566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 756da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 757aa372e3fSPaul Mullowney 758aa372e3fSPaul Mullowney /* Create the matrix description */ 7599566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 7609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 7611b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 7629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 763afb2bd1cSJunchao Zhang #else 7649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 765afb2bd1cSJunchao Zhang #endif 7669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 7679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 768aa372e3fSPaul Mullowney 769aa372e3fSPaul Mullowney /* set the operation */ 770aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 771aa372e3fSPaul Mullowney 772aa372e3fSPaul Mullowney /* set the matrix */ 773aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 774aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 775aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 776aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 777aa372e3fSPaul Mullowney 778aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 779aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 780aa372e3fSPaul Mullowney 781aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 782aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 783aa372e3fSPaul Mullowney 784aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 785aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 786aa372e3fSPaul Mullowney 787afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 7889566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 789*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 7901b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 791*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 792afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 793afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 794afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 7955f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 7969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 797afb2bd1cSJunchao Zhang #endif 798afb2bd1cSJunchao Zhang 799aa372e3fSPaul Mullowney /* perform the solve analysis */ 800*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 801aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 802aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 803d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 8041b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 805d49cd2b7SBarry Smith loTriFactor->solveInfo, 8065f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 807d49cd2b7SBarry Smith #else 8085f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 809afb2bd1cSJunchao Zhang #endif 8109566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 8119566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 812aa372e3fSPaul Mullowney 813da79fbbcSStefano Zampini /* assign the pointer */ 814aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 815087f3262SPaul Mullowney 8169566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 8179566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 8189566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 819da79fbbcSStefano Zampini } else { 820da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 821da79fbbcSStefano Zampini offset = 0; 822da79fbbcSStefano Zampini for (i=0; i<n; i++) { 823da79fbbcSStefano Zampini /* set the pointers */ 824da79fbbcSStefano Zampini v = aa + ai[i]; 825da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 826da79fbbcSStefano Zampini 827da79fbbcSStefano Zampini /* first, set the diagonal elements */ 828da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 829da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 830da79fbbcSStefano Zampini 831da79fbbcSStefano Zampini offset+=1; 832da79fbbcSStefano Zampini if (nz>0) { 8339566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 834da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 835da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 836da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 837da79fbbcSStefano Zampini } 838da79fbbcSStefano Zampini offset+=nz; 839da79fbbcSStefano Zampini } 840da79fbbcSStefano Zampini } 84128b400f6SJacob Faibussowitsch PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 84228b400f6SJacob Faibussowitsch PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 843da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 844da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 8459566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 846da79fbbcSStefano Zampini } 8479566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AAUp)); 8489566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AALo)); 849087f3262SPaul Mullowney } catch(char *ex) { 85098921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 851087f3262SPaul Mullowney } 852087f3262SPaul Mullowney } 853087f3262SPaul Mullowney PetscFunctionReturn(0); 854087f3262SPaul Mullowney } 855087f3262SPaul Mullowney 856087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 8579ae82921SPaul Mullowney { 858087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 859087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 860087f3262SPaul Mullowney IS ip = a->row; 861087f3262SPaul Mullowney PetscBool perm_identity; 862087f3262SPaul Mullowney PetscInt n = A->rmap->n; 863087f3262SPaul Mullowney 864087f3262SPaul Mullowney PetscFunctionBegin; 86528b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 8669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 867da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 868aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 869aa372e3fSPaul Mullowney 870da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 871da79fbbcSStefano Zampini 872087f3262SPaul Mullowney /* lower triangular indices */ 8739566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip,&perm_identity)); 874087f3262SPaul Mullowney if (!perm_identity) { 8754e4bbfaaSStefano Zampini IS iip; 876da79fbbcSStefano Zampini const PetscInt *irip,*rip; 8774e4bbfaaSStefano Zampini 8789566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 8799566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iip,&irip)); 8809566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ip,&rip)); 881aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 882aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 883aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 8844e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 8859566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iip,&irip)); 8869566063dSJacob Faibussowitsch PetscCall(ISDestroy(&iip)); 8879566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ip,&rip)); 8889566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 889da79fbbcSStefano Zampini } 890087f3262SPaul Mullowney PetscFunctionReturn(0); 891087f3262SPaul Mullowney } 892087f3262SPaul Mullowney 893087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 894087f3262SPaul Mullowney { 895087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 896087f3262SPaul Mullowney IS ip = b->row; 897087f3262SPaul Mullowney PetscBool perm_identity; 898087f3262SPaul Mullowney 899087f3262SPaul Mullowney PetscFunctionBegin; 9009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 9019566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 902ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 903087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 9049566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip,&perm_identity)); 905087f3262SPaul Mullowney if (perm_identity) { 906087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 907087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9084e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9094e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 910087f3262SPaul Mullowney } else { 911087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 912087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9134e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9144e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 915087f3262SPaul Mullowney } 916087f3262SPaul Mullowney 917087f3262SPaul Mullowney /* get the triangular factors */ 9189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 919087f3262SPaul Mullowney PetscFunctionReturn(0); 920087f3262SPaul Mullowney } 9219ae82921SPaul Mullowney 922b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 923bda325fcSPaul Mullowney { 924bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 925aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 926aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 927da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 928da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 929aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 930aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 931aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 932aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 933b175d8bbSPaul Mullowney 934bda325fcSPaul Mullowney PetscFunctionBegin; 935aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 9369566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 937da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 938aa372e3fSPaul Mullowney 939aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 940aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 941aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 942aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 943aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 944aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 945aa372e3fSPaul Mullowney 946aa372e3fSPaul Mullowney /* Create the matrix description */ 9479566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 9489566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 9499566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 9509566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 9519566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 952aa372e3fSPaul Mullowney 953aa372e3fSPaul Mullowney /* set the operation */ 954aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 955aa372e3fSPaul Mullowney 956aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 957aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 958afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 959afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 960aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 961afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 962afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 963afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 964aa372e3fSPaul Mullowney 965aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 966afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 9679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 968afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 969afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 970afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 971afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 972afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 973afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 974afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 9755f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 9769566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 977afb2bd1cSJunchao Zhang #endif 978afb2bd1cSJunchao Zhang 9799566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 9809566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 981aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 982aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 983aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 984aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 985aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 986afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 987afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 988afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 9895f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 990afb2bd1cSJunchao Zhang #else 991afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 9925f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 993afb2bd1cSJunchao Zhang #endif 9949566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9959566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 996aa372e3fSPaul Mullowney 997afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 9989566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 999*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 10001b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1001*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1002afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1003afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1004afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 10055f80ce2aSJacob Faibussowitsch &loTriFactorT->solveBufferSize)); 10069566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 1007afb2bd1cSJunchao Zhang #endif 1008afb2bd1cSJunchao Zhang 1009afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1010*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1011afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1012afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1013d49cd2b7SBarry Smith loTriFactorT->csrMat->column_indices->data().get(), 10141b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1015d49cd2b7SBarry Smith loTriFactorT->solveInfo, 10165f80ce2aSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1017d49cd2b7SBarry Smith #else 10185f80ce2aSJacob Faibussowitsch loTriFactorT->solveInfo)); 1019afb2bd1cSJunchao Zhang #endif 10209566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10219566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1022aa372e3fSPaul Mullowney 1023da79fbbcSStefano Zampini /* assign the pointer */ 1024aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1025aa372e3fSPaul Mullowney 1026aa372e3fSPaul Mullowney /*********************************************/ 1027aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1028aa372e3fSPaul Mullowney /*********************************************/ 1029aa372e3fSPaul Mullowney 1030aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 10319566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactorT)); 1032da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1033aa372e3fSPaul Mullowney 1034aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1035aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1036aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1037aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 1038aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1039aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1040aa372e3fSPaul Mullowney 1041aa372e3fSPaul Mullowney /* Create the matrix description */ 10429566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 10439566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 10449566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 10459566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 10469566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1047aa372e3fSPaul Mullowney 1048aa372e3fSPaul Mullowney /* set the operation */ 1049aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1050aa372e3fSPaul Mullowney 1051aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1052aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1053afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1054afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1055aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1056afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1057afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1058afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1059aa372e3fSPaul Mullowney 1060aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 1061afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 10629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1063afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1064afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1065afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1066afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1067afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1068afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1069afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 10705f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 10719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1072afb2bd1cSJunchao Zhang #endif 1073afb2bd1cSJunchao Zhang 10749566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 10759566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1076aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1077aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1078aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1079aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1080aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1081afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1082afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1083afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 10845f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1085afb2bd1cSJunchao Zhang #else 1086afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 10875f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 1088afb2bd1cSJunchao Zhang #endif 1089d49cd2b7SBarry Smith 10909566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10919566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1092aa372e3fSPaul Mullowney 1093afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 10949566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1095*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 10961b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1097*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1098afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1099afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1100afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 11015f80ce2aSJacob Faibussowitsch &upTriFactorT->solveBufferSize)); 11029566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1103afb2bd1cSJunchao Zhang #endif 1104afb2bd1cSJunchao Zhang 1105afb2bd1cSJunchao Zhang /* perform the solve analysis */ 11065f80ce2aSJacob Faibussowitsch /* christ, would it have killed you to put this stuff in a function????????? */ 1107*261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1108afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1109afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1110d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 11111b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1112d49cd2b7SBarry Smith upTriFactorT->solveInfo, 11135f80ce2aSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1114d49cd2b7SBarry Smith #else 11155f80ce2aSJacob Faibussowitsch upTriFactorT->solveInfo)); 1116afb2bd1cSJunchao Zhang #endif 1117d49cd2b7SBarry Smith 11189566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11199566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1120aa372e3fSPaul Mullowney 1121da79fbbcSStefano Zampini /* assign the pointer */ 1122aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1123bda325fcSPaul Mullowney PetscFunctionReturn(0); 1124bda325fcSPaul Mullowney } 1125bda325fcSPaul Mullowney 1126a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1127a49f1ed0SStefano Zampini { 1128a49f1ed0SStefano Zampini __host__ __device__ 1129a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1130a49f1ed0SStefano Zampini { 1131a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1132a49f1ed0SStefano Zampini } 1133a49f1ed0SStefano Zampini }; 1134a49f1ed0SStefano Zampini 11353606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1136bda325fcSPaul Mullowney { 1137aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1138a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1139bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1140bda325fcSPaul Mullowney cusparseStatus_t stat; 1141aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1142b175d8bbSPaul Mullowney 1143bda325fcSPaul Mullowney PetscFunctionBegin; 11449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1145a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 114628b400f6SJacob Faibussowitsch PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1147a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 114808401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 11491a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 11509566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 11519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1152a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 11539566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1154a49f1ed0SStefano Zampini } 1155a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1156aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 11579566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1158aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 11599566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 11609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1161aa372e3fSPaul Mullowney 1162b06137fdSPaul Mullowney /* set alpha and beta */ 11639566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 11649566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 11659566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 11669566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 11679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 11689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1169b06137fdSPaul Mullowney 1170aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1171aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1172a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1173554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1174554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1175aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1176a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1177aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1178aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1179a3fdcf43SKarl Rupp 1180039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 118181902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1182afb2bd1cSJunchao Zhang 1183afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 11843606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1185afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1186afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1187afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1188afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1189afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 11909566063dSJacob Faibussowitsch indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 11913606e59fSJunchao Zhang #else 11923606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 11933606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 11943606e59fSJunchao Zhang 11953606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 11963606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 11973606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 11983606e59fSJunchao Zhang */ 11993606e59fSJunchao Zhang if (matrixT->num_entries) { 12003606e59fSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 12013606e59fSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 12023606e59fSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 12033606e59fSJunchao Zhang matrixT->values->data().get(), 12043606e59fSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 12059566063dSJacob Faibussowitsch indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 12063606e59fSJunchao Zhang 12073606e59fSJunchao Zhang } else { 12083606e59fSJunchao Zhang matstructT->matDescr = NULL; 12093606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 12103606e59fSJunchao Zhang } 12113606e59fSJunchao Zhang #endif 1212afb2bd1cSJunchao Zhang #endif 1213aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1214afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1215afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1216afb2bd1cSJunchao Zhang #else 1217aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 121851c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 121951c6d536SStefano Zampini /* First convert HYB to CSR */ 1220aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1221aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1222aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1223aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1224aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1225aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1226aa372e3fSPaul Mullowney 1227aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1228aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1229aa372e3fSPaul Mullowney temp->values->data().get(), 1230aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 12319566063dSJacob Faibussowitsch temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1232aa372e3fSPaul Mullowney 1233aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1234aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1235aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1236aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1237aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1238aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1239aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1240aa372e3fSPaul Mullowney 1241aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1242aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1243aa372e3fSPaul Mullowney temp->values->data().get(), 1244aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1245aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1246aa372e3fSPaul Mullowney tempT->values->data().get(), 1247aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1248aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 12499566063dSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1250aa372e3fSPaul Mullowney 1251aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1252aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 12539566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1254aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1255aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1256aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1257aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1258aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1259aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 12609566063dSJacob Faibussowitsch hybMat, 0, partition);PetscCallCUSPARSE(stat); 1261aa372e3fSPaul Mullowney 1262aa372e3fSPaul Mullowney /* assign the pointer */ 1263aa372e3fSPaul Mullowney matstructT->mat = hybMat; 12641a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1265aa372e3fSPaul Mullowney /* delete temporaries */ 1266aa372e3fSPaul Mullowney if (tempT) { 1267aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1268aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1269aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1270aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1271087f3262SPaul Mullowney } 1272aa372e3fSPaul Mullowney if (temp) { 1273aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1274aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1275aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1276aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1277aa372e3fSPaul Mullowney } 1278afb2bd1cSJunchao Zhang #endif 1279aa372e3fSPaul Mullowney } 1280a49f1ed0SStefano Zampini } 1281a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1282a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1283a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 128428b400f6SJacob Faibussowitsch PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 128528b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 128628b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 128728b400f6SJacob Faibussowitsch PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 128828b400f6SJacob Faibussowitsch PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 128928b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 129028b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 129128b400f6SJacob Faibussowitsch PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1292a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1293a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1294a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 12959566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1296a49f1ed0SStefano Zampini } 1297a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1298a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1299a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1300a49f1ed0SStefano Zampini 1301a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1302a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1303a49f1ed0SStefano Zampini void *csr2cscBuffer; 1304a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1305a49f1ed0SStefano Zampini stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1306a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1307a49f1ed0SStefano Zampini matrix->values->data().get(), 1308a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1309a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1310a49f1ed0SStefano Zampini matrixT->values->data().get(), 1311a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1312a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 13139566063dSJacob Faibussowitsch cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat); 13149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1315a49f1ed0SStefano Zampini #endif 1316a49f1ed0SStefano Zampini 13171a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13181a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13191a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13201a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13211a2c6b5cSJunchao Zhang 13221a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13231a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 13241a2c6b5cSJunchao Zhang */ 13251a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 13261a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 13271a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 13281a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 13291a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1330a49f1ed0SStefano Zampini matrixT->values->data().get(), 1331a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1332a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1333a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 13349566063dSJacob Faibussowitsch cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat); 1335a49f1ed0SStefano Zampini #else 1336a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 13379566063dSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1338a49f1ed0SStefano Zampini #endif 13391a2c6b5cSJunchao Zhang } else { 13401a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 13411a2c6b5cSJunchao Zhang } 13421a2c6b5cSJunchao Zhang 1343a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1344a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1345a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 13469566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1347a49f1ed0SStefano Zampini #endif 1348a49f1ed0SStefano Zampini } 1349a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1350a49f1ed0SStefano Zampini thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1351a49f1ed0SStefano Zampini matrixT->values->begin())); 1352a49f1ed0SStefano Zampini } 13539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 13549566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1355213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1356213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1357aa372e3fSPaul Mullowney /* assign the pointer */ 1358aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 13591a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1360bda325fcSPaul Mullowney PetscFunctionReturn(0); 1361bda325fcSPaul Mullowney } 1362bda325fcSPaul Mullowney 1363a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 13646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1365bda325fcSPaul Mullowney { 1366c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1367465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1368465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1369465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1370465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1371bda325fcSPaul Mullowney cusparseStatus_t stat; 1372bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1373aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1374aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1375aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1376bda325fcSPaul Mullowney 1377bda325fcSPaul Mullowney PetscFunctionBegin; 1378aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1379aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 13809566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1381aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1382aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1383bda325fcSPaul Mullowney } 1384bda325fcSPaul Mullowney 1385bda325fcSPaul Mullowney /* Get the GPU pointers */ 13869566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 13879566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1388c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1389c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1390bda325fcSPaul Mullowney 13919566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1392aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1393a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1394c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1395c41cb2e2SAlejandro Lamas Daviña xGPU); 1396aa372e3fSPaul Mullowney 1397aa372e3fSPaul Mullowney /* First, solve U */ 1398*261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1399afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14001b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1401afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1402afb2bd1cSJunchao Zhang #endif 1403afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1404aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1405aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1406aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1407aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1408d49cd2b7SBarry Smith xarray, 14091b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1410d49cd2b7SBarry Smith tempGPU->data().get(), 14119566063dSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1412d49cd2b7SBarry Smith #else 14139566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1414afb2bd1cSJunchao Zhang #endif 1415aa372e3fSPaul Mullowney 1416aa372e3fSPaul Mullowney /* Then, solve L */ 1417*261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1418afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14191b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1420afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1421afb2bd1cSJunchao Zhang #endif 1422afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1423aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1424aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1425aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1426aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1427d49cd2b7SBarry Smith tempGPU->data().get(), 14281b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1429d49cd2b7SBarry Smith xarray, 14309566063dSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1431d49cd2b7SBarry Smith #else 14329566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1433afb2bd1cSJunchao Zhang #endif 1434aa372e3fSPaul Mullowney 1435aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1436a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1437c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1438aa372e3fSPaul Mullowney tempGPU->begin()); 1439aa372e3fSPaul Mullowney 1440aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1441a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1442bda325fcSPaul Mullowney 1443bda325fcSPaul Mullowney /* restore */ 14449566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 14459566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 14469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1448bda325fcSPaul Mullowney PetscFunctionReturn(0); 1449bda325fcSPaul Mullowney } 1450bda325fcSPaul Mullowney 14516fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1452bda325fcSPaul Mullowney { 1453465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1454465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1455bda325fcSPaul Mullowney cusparseStatus_t stat; 1456bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1457aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1458aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1459aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1460bda325fcSPaul Mullowney 1461bda325fcSPaul Mullowney PetscFunctionBegin; 1462aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1463aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 14649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1465aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1466aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1467bda325fcSPaul Mullowney } 1468bda325fcSPaul Mullowney 1469bda325fcSPaul Mullowney /* Get the GPU pointers */ 14709566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 14719566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1472bda325fcSPaul Mullowney 14739566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1474aa372e3fSPaul Mullowney /* First, solve U */ 1475*261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1476afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14771b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1478afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1479afb2bd1cSJunchao Zhang #endif 1480afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1481aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1482aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1483aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1484aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1485d49cd2b7SBarry Smith barray, 14861b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1487d49cd2b7SBarry Smith tempGPU->data().get(), 14889566063dSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1489d49cd2b7SBarry Smith #else 14909566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1491afb2bd1cSJunchao Zhang #endif 1492aa372e3fSPaul Mullowney 1493aa372e3fSPaul Mullowney /* Then, solve L */ 1494*261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1495afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14961b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1497afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1498afb2bd1cSJunchao Zhang #endif 1499afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1500aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1501aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1502aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1503aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1504d49cd2b7SBarry Smith tempGPU->data().get(), 15051b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1506d49cd2b7SBarry Smith xarray, 15079566063dSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1508d49cd2b7SBarry Smith #else 15099566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1510afb2bd1cSJunchao Zhang #endif 1511bda325fcSPaul Mullowney 1512bda325fcSPaul Mullowney /* restore */ 15139566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 15149566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 15159566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1517bda325fcSPaul Mullowney PetscFunctionReturn(0); 1518bda325fcSPaul Mullowney } 1519bda325fcSPaul Mullowney 15206fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 15219ae82921SPaul Mullowney { 1522465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1523465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1524465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1525465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 15269ae82921SPaul Mullowney cusparseStatus_t stat; 15279ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1528aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1529aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1530aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 15319ae82921SPaul Mullowney 15329ae82921SPaul Mullowney PetscFunctionBegin; 1533ebc8f436SDominic Meiser 1534e057df02SPaul Mullowney /* Get the GPU pointers */ 15359566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 15369566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1537c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1538c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 15399ae82921SPaul Mullowney 15409566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1541aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1542a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1543c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 15444e4bbfaaSStefano Zampini tempGPU->begin()); 1545aa372e3fSPaul Mullowney 1546aa372e3fSPaul Mullowney /* Next, solve L */ 1547*261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1548afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 15491b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1550afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1551afb2bd1cSJunchao Zhang #endif 1552afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1553aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1554aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1555aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1556aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1557d49cd2b7SBarry Smith tempGPU->data().get(), 15581b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1559d49cd2b7SBarry Smith xarray, 15609566063dSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1561d49cd2b7SBarry Smith #else 15629566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1563afb2bd1cSJunchao Zhang #endif 1564aa372e3fSPaul Mullowney 1565aa372e3fSPaul Mullowney /* Then, solve U */ 1566*261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1567afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 15681b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1569afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1570afb2bd1cSJunchao Zhang #endif 1571afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1572aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1573aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1574aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1575d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 15761b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1577d49cd2b7SBarry Smith tempGPU->data().get(), 15789566063dSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1579d49cd2b7SBarry Smith #else 15809566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1581afb2bd1cSJunchao Zhang #endif 1582d49cd2b7SBarry Smith 15834e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1584a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 15854e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 15864e4bbfaaSStefano Zampini xGPU); 15879ae82921SPaul Mullowney 15889566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 15899566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 15909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15919566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 15929ae82921SPaul Mullowney PetscFunctionReturn(0); 15939ae82921SPaul Mullowney } 15949ae82921SPaul Mullowney 15956fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 15969ae82921SPaul Mullowney { 1597465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1598465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 15999ae82921SPaul Mullowney cusparseStatus_t stat; 16009ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1601aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1602aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1603aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 16049ae82921SPaul Mullowney 16059ae82921SPaul Mullowney PetscFunctionBegin; 1606e057df02SPaul Mullowney /* Get the GPU pointers */ 16079566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 16089566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 16099ae82921SPaul Mullowney 16109566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1611aa372e3fSPaul Mullowney /* First, solve L */ 1612*261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1613afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16141b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1615afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1616afb2bd1cSJunchao Zhang #endif 1617afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1618aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1619aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1620aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1621aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1622d49cd2b7SBarry Smith barray, 16231b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1624d49cd2b7SBarry Smith tempGPU->data().get(), 16259566063dSJacob Faibussowitsch loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1626d49cd2b7SBarry Smith #else 16279566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1628afb2bd1cSJunchao Zhang #endif 1629d49cd2b7SBarry Smith 1630aa372e3fSPaul Mullowney /* Next, solve U */ 1631*261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1632afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1634afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1635afb2bd1cSJunchao Zhang #endif 1636afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1637aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1638aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1639aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1640aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1641d49cd2b7SBarry Smith tempGPU->data().get(), 16421b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1643d49cd2b7SBarry Smith xarray, 16449566063dSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1645d49cd2b7SBarry Smith #else 16469566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1647afb2bd1cSJunchao Zhang #endif 16489ae82921SPaul Mullowney 16499566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 16509566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 16519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16529566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 16539ae82921SPaul Mullowney PetscFunctionReturn(0); 16549ae82921SPaul Mullowney } 16559ae82921SPaul Mullowney 1656841d4cb1SJunchao Zhang PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1657841d4cb1SJunchao Zhang { 1658841d4cb1SJunchao Zhang PetscFunctionBegin; 1659841d4cb1SJunchao Zhang *type = MATSOLVERCUSPARSE; 1660841d4cb1SJunchao Zhang PetscFunctionReturn(0); 1661841d4cb1SJunchao Zhang } 1662841d4cb1SJunchao Zhang 1663841d4cb1SJunchao Zhang /*MC 1664841d4cb1SJunchao Zhang MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 1665841d4cb1SJunchao Zhang on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 1666841d4cb1SJunchao Zhang algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 1667841d4cb1SJunchao Zhang performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 1668841d4cb1SJunchao Zhang CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 1669841d4cb1SJunchao Zhang algorithms are not recommended. This class does NOT support direct solver operations. 1670841d4cb1SJunchao Zhang 1671841d4cb1SJunchao Zhang Level: beginner 1672841d4cb1SJunchao Zhang 1673841d4cb1SJunchao Zhang .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 1674841d4cb1SJunchao Zhang M*/ 1675841d4cb1SJunchao Zhang 1676841d4cb1SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1677841d4cb1SJunchao Zhang { 1678841d4cb1SJunchao Zhang PetscInt n = A->rmap->n; 1679841d4cb1SJunchao Zhang 1680841d4cb1SJunchao Zhang PetscFunctionBegin; 1681841d4cb1SJunchao Zhang PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 1682841d4cb1SJunchao Zhang PetscCall(MatSetSizes(*B,n,n,n,n)); 1683841d4cb1SJunchao Zhang (*B)->factortype = ftype; 1684841d4cb1SJunchao Zhang PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE)); 1685841d4cb1SJunchao Zhang 1686841d4cb1SJunchao Zhang if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE)); 1687841d4cb1SJunchao Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 1688841d4cb1SJunchao Zhang PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 1689841d4cb1SJunchao Zhang if (!A->boundtocpu) { 1690841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1691841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 1692841d4cb1SJunchao Zhang } else { 1693841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 1694841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 1695841d4cb1SJunchao Zhang } 1696841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 1697841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 1698841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 1699841d4cb1SJunchao Zhang } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 1700841d4cb1SJunchao Zhang if (!A->boundtocpu) { 1701841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 1702841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1703841d4cb1SJunchao Zhang } else { 1704841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 1705841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 1706841d4cb1SJunchao Zhang } 1707841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 1708841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 1709841d4cb1SJunchao Zhang } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 1710841d4cb1SJunchao Zhang 1711841d4cb1SJunchao Zhang PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 1712841d4cb1SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 1713841d4cb1SJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 1714841d4cb1SJunchao Zhang PetscFunctionReturn(0); 1715841d4cb1SJunchao Zhang } 1716841d4cb1SJunchao Zhang 17177e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 17187e8381f9SStefano Zampini { 17197e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 17207e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 17217e8381f9SStefano Zampini 17227e8381f9SStefano Zampini PetscFunctionBegin; 17237e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 17247e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 17257e8381f9SStefano Zampini 17269566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 17279566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 17289566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 17299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 17309566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 17317e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 17327e8381f9SStefano Zampini } 17337e8381f9SStefano Zampini PetscFunctionReturn(0); 17347e8381f9SStefano Zampini } 17357e8381f9SStefano Zampini 17367e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 17377e8381f9SStefano Zampini { 17387e8381f9SStefano Zampini PetscFunctionBegin; 17399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 174067a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 174167a45760SJunchao Zhang PetscFunctionReturn(0); 174267a45760SJunchao Zhang } 174367a45760SJunchao Zhang 174467a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 174567a45760SJunchao Zhang { 174667a45760SJunchao Zhang PetscFunctionBegin; 17477e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 174867a45760SJunchao Zhang *array = NULL; 174967a45760SJunchao Zhang PetscFunctionReturn(0); 175067a45760SJunchao Zhang } 175167a45760SJunchao Zhang 175267a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 175367a45760SJunchao Zhang { 175467a45760SJunchao Zhang PetscFunctionBegin; 17559566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 175667a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 175767a45760SJunchao Zhang PetscFunctionReturn(0); 175867a45760SJunchao Zhang } 175967a45760SJunchao Zhang 176067a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 176167a45760SJunchao Zhang { 176267a45760SJunchao Zhang PetscFunctionBegin; 176367a45760SJunchao Zhang *array = NULL; 176467a45760SJunchao Zhang PetscFunctionReturn(0); 176567a45760SJunchao Zhang } 176667a45760SJunchao Zhang 176767a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 176867a45760SJunchao Zhang { 176967a45760SJunchao Zhang PetscFunctionBegin; 177067a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 177167a45760SJunchao Zhang PetscFunctionReturn(0); 177267a45760SJunchao Zhang } 177367a45760SJunchao Zhang 177467a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 177567a45760SJunchao Zhang { 177667a45760SJunchao Zhang PetscFunctionBegin; 177767a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 177867a45760SJunchao Zhang *array = NULL; 17797e8381f9SStefano Zampini PetscFunctionReturn(0); 17807e8381f9SStefano Zampini } 17817e8381f9SStefano Zampini 17827ee59b9bSJunchao Zhang static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 17837ee59b9bSJunchao Zhang { 17847ee59b9bSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp; 17857ee59b9bSJunchao Zhang CsrMatrix *matrix; 17867ee59b9bSJunchao Zhang 17877ee59b9bSJunchao Zhang PetscFunctionBegin; 17887ee59b9bSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 17897ee59b9bSJunchao Zhang PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 17907ee59b9bSJunchao Zhang cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr); 17917ee59b9bSJunchao Zhang PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL"); 17927ee59b9bSJunchao Zhang matrix = (CsrMatrix*)cusp->mat->mat; 17937ee59b9bSJunchao Zhang 17947ee59b9bSJunchao Zhang if (i) { 17957ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 17967ee59b9bSJunchao Zhang *i = matrix->row_offsets->data().get(); 17977ee59b9bSJunchao Zhang #else 17987ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 17997ee59b9bSJunchao Zhang #endif 18007ee59b9bSJunchao Zhang } 18017ee59b9bSJunchao Zhang if (j) { 18027ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 18037ee59b9bSJunchao Zhang *j = matrix->column_indices->data().get(); 18047ee59b9bSJunchao Zhang #else 18057ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 18067ee59b9bSJunchao Zhang #endif 18077ee59b9bSJunchao Zhang } 18087ee59b9bSJunchao Zhang if (a) *a = matrix->values->data().get(); 18097ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 18107ee59b9bSJunchao Zhang PetscFunctionReturn(0); 18117ee59b9bSJunchao Zhang } 18127ee59b9bSJunchao Zhang 1813042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 18149ae82921SPaul Mullowney { 1815aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 18167c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 18179ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1818213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 1819aa372e3fSPaul Mullowney cusparseStatus_t stat; 1820abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 18219ae82921SPaul Mullowney 18229ae82921SPaul Mullowney PetscFunctionBegin; 182328b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 1824c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1825a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 1826a49f1ed0SStefano Zampini CsrMatrix *matrix; 1827afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 182885ba7357SStefano Zampini 182908401ef6SPierre Jolivet PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 18309566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 1831afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 18329566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 18339566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar))); 18349566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 18359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 183634d6c7a5SJose E. Roman } else { 1837abb89eb1SStefano Zampini PetscInt nnz; 18389566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 18399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format)); 18409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 18417c700b8dSJunchao Zhang delete cusparsestruct->workVector; 184281902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 1843a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 1844a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 18459ae82921SPaul Mullowney try { 18469ae82921SPaul Mullowney if (a->compressedrow.use) { 18479ae82921SPaul Mullowney m = a->compressedrow.nrows; 18489ae82921SPaul Mullowney ii = a->compressedrow.i; 18499ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 18509ae82921SPaul Mullowney } else { 1851213423ffSJunchao Zhang m = A->rmap->n; 1852213423ffSJunchao Zhang ii = a->i; 1853e6e9a74fSStefano Zampini ridx = NULL; 18549ae82921SPaul Mullowney } 185508401ef6SPierre Jolivet PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 1856abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 1857abb89eb1SStefano Zampini else nnz = a->nz; 185808401ef6SPierre Jolivet PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 18599ae82921SPaul Mullowney 186085ba7357SStefano Zampini /* create cusparse matrix */ 1861abb89eb1SStefano Zampini cusparsestruct->nrows = m; 1862aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 18639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 18649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 18659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 18669ae82921SPaul Mullowney 18679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar))); 18689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar))); 18699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 18709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18729566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 18739566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 1874b06137fdSPaul Mullowney 1875aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1876aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1877aa372e3fSPaul Mullowney /* set the matrix */ 1878afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1879afb2bd1cSJunchao Zhang mat->num_rows = m; 1880afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1881abb89eb1SStefano Zampini mat->num_entries = nnz; 1882afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1883afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 18849ae82921SPaul Mullowney 1885abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1886abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1887aa372e3fSPaul Mullowney 1888abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1889abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1890aa372e3fSPaul Mullowney 1891aa372e3fSPaul Mullowney /* assign the pointer */ 1892afb2bd1cSJunchao Zhang matstruct->mat = mat; 1893afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1894afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 1895afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1896afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1897afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1898afb2bd1cSJunchao Zhang mat->values->data().get(), 1899afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 19009566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 1901afb2bd1cSJunchao Zhang } 1902afb2bd1cSJunchao Zhang #endif 1903aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1904afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1905afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1906afb2bd1cSJunchao Zhang #else 1907afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1908afb2bd1cSJunchao Zhang mat->num_rows = m; 1909afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1910abb89eb1SStefano Zampini mat->num_entries = nnz; 1911afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1912afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1913aa372e3fSPaul Mullowney 1914abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 1915abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 1916aa372e3fSPaul Mullowney 1917abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 1918abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 1919aa372e3fSPaul Mullowney 1920aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 19219566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1922aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1923aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1924afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1925afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1926afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1927afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 19289566063dSJacob Faibussowitsch hybMat, 0, partition);PetscCallCUSPARSE(stat); 1929aa372e3fSPaul Mullowney /* assign the pointer */ 1930aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1931aa372e3fSPaul Mullowney 1932afb2bd1cSJunchao Zhang if (mat) { 1933afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1934afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1935afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1936afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1937087f3262SPaul Mullowney } 1938afb2bd1cSJunchao Zhang #endif 1939087f3262SPaul Mullowney } 1940ca45077fSPaul Mullowney 1941aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1942213423ffSJunchao Zhang if (a->compressedrow.use) { 1943213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1944aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1945aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1946213423ffSJunchao Zhang tmp = m; 1947213423ffSJunchao Zhang } else { 1948213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1949213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1950213423ffSJunchao Zhang tmp = 0; 1951213423ffSJunchao Zhang } 19529566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar))); 1953aa372e3fSPaul Mullowney 1954aa372e3fSPaul Mullowney /* assign the pointer */ 1955aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 19569ae82921SPaul Mullowney } catch(char *ex) { 195798921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 19589ae82921SPaul Mullowney } 19599566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 19609566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 196134d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 196234d6c7a5SJose E. Roman } 1963abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 19649ae82921SPaul Mullowney } 19659ae82921SPaul Mullowney PetscFunctionReturn(0); 19669ae82921SPaul Mullowney } 19679ae82921SPaul Mullowney 1968c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1969aa372e3fSPaul Mullowney { 1970aa372e3fSPaul Mullowney template <typename Tuple> 1971aa372e3fSPaul Mullowney __host__ __device__ 1972aa372e3fSPaul Mullowney void operator()(Tuple t) 1973aa372e3fSPaul Mullowney { 1974aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1975aa372e3fSPaul Mullowney } 1976aa372e3fSPaul Mullowney }; 1977aa372e3fSPaul Mullowney 19787e8381f9SStefano Zampini struct VecCUDAEquals 19797e8381f9SStefano Zampini { 19807e8381f9SStefano Zampini template <typename Tuple> 19817e8381f9SStefano Zampini __host__ __device__ 19827e8381f9SStefano Zampini void operator()(Tuple t) 19837e8381f9SStefano Zampini { 19847e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 19857e8381f9SStefano Zampini } 19867e8381f9SStefano Zampini }; 19877e8381f9SStefano Zampini 1988e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1989e6e9a74fSStefano Zampini { 1990e6e9a74fSStefano Zampini template <typename Tuple> 1991e6e9a74fSStefano Zampini __host__ __device__ 1992e6e9a74fSStefano Zampini void operator()(Tuple t) 1993e6e9a74fSStefano Zampini { 1994e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1995e6e9a74fSStefano Zampini } 1996e6e9a74fSStefano Zampini }; 1997e6e9a74fSStefano Zampini 1998afb2bd1cSJunchao Zhang struct MatMatCusparse { 1999ccdfe979SStefano Zampini PetscBool cisdense; 2000ccdfe979SStefano Zampini PetscScalar *Bt; 2001ccdfe979SStefano Zampini Mat X; 2002fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2003fcdce8c4SStefano Zampini PetscLogDouble flops; 2004fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 2005b4285af6SJunchao Zhang 2006afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2007fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2008afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2009afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 2010afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 2011afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2012b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2013b4285af6SJunchao Zhang void *dBuffer4; 2014b4285af6SJunchao Zhang void *dBuffer5; 2015b4285af6SJunchao Zhang #endif 2016fcdce8c4SStefano Zampini size_t mmBufferSize; 2017fcdce8c4SStefano Zampini void *mmBuffer; 2018fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2019fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2020afb2bd1cSJunchao Zhang #endif 2021afb2bd1cSJunchao Zhang }; 2022ccdfe979SStefano Zampini 2023ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2024ccdfe979SStefano Zampini { 2025ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 2026ccdfe979SStefano Zampini 2027ccdfe979SStefano Zampini PetscFunctionBegin; 20289566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->Bt)); 2029fcdce8c4SStefano Zampini delete mmdata->Bcsr; 2030afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 20319566063dSJacob Faibussowitsch if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 20329566063dSJacob Faibussowitsch if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 20339566063dSJacob Faibussowitsch if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 20349566063dSJacob Faibussowitsch if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2035b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 20369566063dSJacob Faibussowitsch if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 20379566063dSJacob Faibussowitsch if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2038b4285af6SJunchao Zhang #endif 20399566063dSJacob Faibussowitsch if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 20409566063dSJacob Faibussowitsch if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2041afb2bd1cSJunchao Zhang #endif 20429566063dSJacob Faibussowitsch PetscCall(MatDestroy(&mmdata->X)); 20439566063dSJacob Faibussowitsch PetscCall(PetscFree(data)); 2044ccdfe979SStefano Zampini PetscFunctionReturn(0); 2045ccdfe979SStefano Zampini } 2046ccdfe979SStefano Zampini 2047ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2048ccdfe979SStefano Zampini 2049ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2050ccdfe979SStefano Zampini { 2051ccdfe979SStefano Zampini Mat_Product *product = C->product; 2052ccdfe979SStefano Zampini Mat A,B; 2053afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 2054ccdfe979SStefano Zampini PetscBool flg,biscuda; 2055ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2056ccdfe979SStefano Zampini cusparseStatus_t stat; 2057ccdfe979SStefano Zampini cusparseOperation_t opA; 2058ccdfe979SStefano Zampini const PetscScalar *barray; 2059ccdfe979SStefano Zampini PetscScalar *carray; 2060ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2061ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2062ccdfe979SStefano Zampini CsrMatrix *csrmat; 2063ccdfe979SStefano Zampini 2064ccdfe979SStefano Zampini PetscFunctionBegin; 2065ccdfe979SStefano Zampini MatCheckProduct(C,1); 206628b400f6SJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2067ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 2068ccdfe979SStefano Zampini A = product->A; 2069ccdfe979SStefano Zampini B = product->B; 20709566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 207128b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2072ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2073ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 207428b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 20759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2076ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2077ccdfe979SStefano Zampini switch (product->type) { 2078ccdfe979SStefano Zampini case MATPRODUCT_AB: 2079ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2080ccdfe979SStefano Zampini mat = cusp->mat; 2081ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2082ccdfe979SStefano Zampini m = A->rmap->n; 2083ccdfe979SStefano Zampini n = B->cmap->n; 2084ccdfe979SStefano Zampini break; 2085ccdfe979SStefano Zampini case MATPRODUCT_AtB: 20861a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2087e6e9a74fSStefano Zampini mat = cusp->mat; 2088e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2089e6e9a74fSStefano Zampini } else { 20909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2091ccdfe979SStefano Zampini mat = cusp->matTranspose; 2092ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2093e6e9a74fSStefano Zampini } 2094ccdfe979SStefano Zampini m = A->cmap->n; 2095ccdfe979SStefano Zampini n = B->cmap->n; 2096ccdfe979SStefano Zampini break; 2097ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2098ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2099ccdfe979SStefano Zampini mat = cusp->mat; 2100ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2101ccdfe979SStefano Zampini m = A->rmap->n; 2102ccdfe979SStefano Zampini n = B->rmap->n; 2103ccdfe979SStefano Zampini break; 2104ccdfe979SStefano Zampini default: 210598921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2106ccdfe979SStefano Zampini } 210728b400f6SJacob Faibussowitsch PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2108ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2109ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 21109566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 21119566063dSJacob Faibussowitsch if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 21129566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2113afb2bd1cSJunchao Zhang 21149566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(B,&blda)); 2115c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 21169566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 21179566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2118c8378d12SStefano Zampini } else { 21199566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 21209566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C,&clda)); 2121c8378d12SStefano Zampini } 2122c8378d12SStefano Zampini 21239566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2124afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2125afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2126a5b23f4aSJose E. Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2127afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2128fcdce8c4SStefano Zampini size_t mmBufferSize; 21299566063dSJacob Faibussowitsch if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2130afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 21319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2132afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2133afb2bd1cSJunchao Zhang } 2134c8378d12SStefano Zampini 21359566063dSJacob Faibussowitsch if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2136afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 21379566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2138afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2139afb2bd1cSJunchao Zhang } 2140afb2bd1cSJunchao Zhang 2141afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2142afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2143afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2144afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2145afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2146afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 21479566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2148afb2bd1cSJunchao Zhang } 2149afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2150afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2151afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 21529566063dSJacob Faibussowitsch cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2153fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 21549566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 21559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2156fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2157fcdce8c4SStefano Zampini } 2158afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2159afb2bd1cSJunchao Zhang } else { 2160afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 21619566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 21629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 21639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2164afb2bd1cSJunchao Zhang } 2165afb2bd1cSJunchao Zhang 2166afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2167afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2168afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2169afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 21709566063dSJacob Faibussowitsch cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2171afb2bd1cSJunchao Zhang #else 2172afb2bd1cSJunchao Zhang PetscInt k; 2173afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2174ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2175ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2176ccdfe979SStefano Zampini cublasStatus_t cerr; 2177ccdfe979SStefano Zampini 21789566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2179ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2180ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2181ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2182ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 21839566063dSJacob Faibussowitsch mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2184ccdfe979SStefano Zampini blda = B->cmap->n; 2185afb2bd1cSJunchao Zhang k = B->cmap->n; 2186afb2bd1cSJunchao Zhang } else { 2187afb2bd1cSJunchao Zhang k = B->rmap->n; 2188ccdfe979SStefano Zampini } 2189ccdfe979SStefano Zampini 2190afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2191ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2192afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2193ccdfe979SStefano Zampini csrmat->values->data().get(), 2194ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2195ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2196ccdfe979SStefano Zampini mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero, 21979566063dSJacob Faibussowitsch carray,clda);PetscCallCUSPARSE(stat); 2198afb2bd1cSJunchao Zhang #endif 21999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 22009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries)); 22019566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayRead(B,&barray)); 2202ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 22039566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 22049566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE)); 2205ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 22069566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 22079566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE)); 2208ccdfe979SStefano Zampini } else { 22099566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray)); 2210ccdfe979SStefano Zampini } 2211ccdfe979SStefano Zampini if (mmdata->cisdense) { 22129566063dSJacob Faibussowitsch PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C)); 2213ccdfe979SStefano Zampini } 2214ccdfe979SStefano Zampini if (!biscuda) { 22159566063dSJacob Faibussowitsch PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B)); 2216ccdfe979SStefano Zampini } 2217ccdfe979SStefano Zampini PetscFunctionReturn(0); 2218ccdfe979SStefano Zampini } 2219ccdfe979SStefano Zampini 2220ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2221ccdfe979SStefano Zampini { 2222ccdfe979SStefano Zampini Mat_Product *product = C->product; 2223ccdfe979SStefano Zampini Mat A,B; 2224ccdfe979SStefano Zampini PetscInt m,n; 2225ccdfe979SStefano Zampini PetscBool cisdense,flg; 2226ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2227ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2228ccdfe979SStefano Zampini 2229ccdfe979SStefano Zampini PetscFunctionBegin; 2230ccdfe979SStefano Zampini MatCheckProduct(C,1); 223128b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2232ccdfe979SStefano Zampini A = product->A; 2233ccdfe979SStefano Zampini B = product->B; 22349566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 223528b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2236ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 223708401ef6SPierre Jolivet PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2238ccdfe979SStefano Zampini switch (product->type) { 2239ccdfe979SStefano Zampini case MATPRODUCT_AB: 2240ccdfe979SStefano Zampini m = A->rmap->n; 2241ccdfe979SStefano Zampini n = B->cmap->n; 2242ccdfe979SStefano Zampini break; 2243ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2244ccdfe979SStefano Zampini m = A->cmap->n; 2245ccdfe979SStefano Zampini n = B->cmap->n; 2246ccdfe979SStefano Zampini break; 2247ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2248ccdfe979SStefano Zampini m = A->rmap->n; 2249ccdfe979SStefano Zampini n = B->rmap->n; 2250ccdfe979SStefano Zampini break; 2251ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2252ccdfe979SStefano Zampini m = B->cmap->n; 2253ccdfe979SStefano Zampini n = B->cmap->n; 2254ccdfe979SStefano Zampini break; 2255ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2256ccdfe979SStefano Zampini m = B->rmap->n; 2257ccdfe979SStefano Zampini n = B->rmap->n; 2258ccdfe979SStefano Zampini break; 2259ccdfe979SStefano Zampini default: 226098921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2261ccdfe979SStefano Zampini } 22629566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,m,n)); 2263ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 22649566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense)); 22659566063dSJacob Faibussowitsch PetscCall(MatSetType(C,MATSEQDENSECUDA)); 2266ccdfe979SStefano Zampini 2267ccdfe979SStefano Zampini /* product data */ 22689566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2269ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2270afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2271afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2272ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 22739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar))); 2274ccdfe979SStefano Zampini } 2275afb2bd1cSJunchao Zhang #endif 2276ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2277ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 22789566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X)); 22799566063dSJacob Faibussowitsch PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA)); 2280ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 22819566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n)); 2282ccdfe979SStefano Zampini } else { 22839566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n)); 2284ccdfe979SStefano Zampini } 2285ccdfe979SStefano Zampini } 2286ccdfe979SStefano Zampini C->product->data = mmdata; 2287ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2288ccdfe979SStefano Zampini 2289ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2290ccdfe979SStefano Zampini PetscFunctionReturn(0); 2291ccdfe979SStefano Zampini } 2292ccdfe979SStefano Zampini 2293fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2294ccdfe979SStefano Zampini { 2295ccdfe979SStefano Zampini Mat_Product *product = C->product; 2296fcdce8c4SStefano Zampini Mat A,B; 2297fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2298fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2299fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2300fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2301fcdce8c4SStefano Zampini PetscBool flg; 2302fcdce8c4SStefano Zampini cusparseStatus_t stat; 2303fcdce8c4SStefano Zampini MatProductType ptype; 2304fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2305fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2306fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2307fcdce8c4SStefano Zampini #endif 2308b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2309ccdfe979SStefano Zampini 2310ccdfe979SStefano Zampini PetscFunctionBegin; 2311ccdfe979SStefano Zampini MatCheckProduct(C,1); 231228b400f6SJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 23139566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg)); 231428b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 2315fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2316fcdce8c4SStefano Zampini A = product->A; 2317fcdce8c4SStefano Zampini B = product->B; 2318fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2319fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2320fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 232108401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2322fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 232328b400f6SJacob Faibussowitsch PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2324fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 232528b400f6SJacob Faibussowitsch PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 2326fcdce8c4SStefano Zampini goto finalize; 2327fcdce8c4SStefano Zampini } 2328fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 23299566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 233028b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 23319566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 233228b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 233328b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 233428b400f6SJacob Faibussowitsch PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2335fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2336fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2337fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 233808401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 233908401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 234008401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 23419566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 23429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2343fcdce8c4SStefano Zampini 2344fcdce8c4SStefano Zampini ptype = product->type; 2345fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2346fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 234728b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 2348fa046f9fSJunchao Zhang } 2349fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2350fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 235128b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 2352fa046f9fSJunchao Zhang } 2353fcdce8c4SStefano Zampini switch (ptype) { 2354fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2355fcdce8c4SStefano Zampini Amat = Acusp->mat; 2356fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2357fcdce8c4SStefano Zampini break; 2358fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2359fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2360fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2361fcdce8c4SStefano Zampini break; 2362fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2363fcdce8c4SStefano Zampini Amat = Acusp->mat; 2364fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2365fcdce8c4SStefano Zampini break; 2366fcdce8c4SStefano Zampini default: 236798921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2368fcdce8c4SStefano Zampini } 2369fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 237028b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 237128b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 237228b400f6SJacob Faibussowitsch PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2373fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2374fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2375fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 237628b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 237728b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 237828b400f6SJacob Faibussowitsch PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 23799566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2380fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2381fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 23829566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2383b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2384b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2385b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2386b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 23879566063dSJacob Faibussowitsch mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2388b4285af6SJunchao Zhang #else 2389b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2390fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2391fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 23929566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2393b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2394fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 23959566063dSJacob Faibussowitsch cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2396b4285af6SJunchao Zhang #endif 2397fcdce8c4SStefano Zampini #else 2398b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2399fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2400fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2401fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 24029566063dSJacob Faibussowitsch Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2403fcdce8c4SStefano Zampini #endif 24049566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 24059566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 24069566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2407fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2408fcdce8c4SStefano Zampini finalize: 2409fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 24109566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 24119566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 24129566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 2413fcdce8c4SStefano Zampini c->reallocs = 0; 2414fcdce8c4SStefano Zampini C->info.mallocs += 0; 2415fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2416fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2417fcdce8c4SStefano Zampini C->num_ass++; 2418ccdfe979SStefano Zampini PetscFunctionReturn(0); 2419ccdfe979SStefano Zampini } 2420fcdce8c4SStefano Zampini 2421fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2422fcdce8c4SStefano Zampini { 2423fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2424fcdce8c4SStefano Zampini Mat A,B; 2425fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2426fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2427fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2428fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2429fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2430fcdce8c4SStefano Zampini PetscBool flg; 2431fcdce8c4SStefano Zampini cusparseStatus_t stat; 2432fcdce8c4SStefano Zampini MatProductType ptype; 2433fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2434fcdce8c4SStefano Zampini PetscLogDouble flops; 2435fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2436fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2437fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2438fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2439fcdce8c4SStefano Zampini #else 2440fcdce8c4SStefano Zampini int cnz; 2441fcdce8c4SStefano Zampini #endif 2442b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2443fcdce8c4SStefano Zampini 2444fcdce8c4SStefano Zampini PetscFunctionBegin; 2445fcdce8c4SStefano Zampini MatCheckProduct(C,1); 244628b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2447fcdce8c4SStefano Zampini A = product->A; 2448fcdce8c4SStefano Zampini B = product->B; 24499566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 245028b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 24519566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 245228b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 2453fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2454fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2455fcdce8c4SStefano Zampini /* product data */ 24569566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2457fcdce8c4SStefano Zampini C->product->data = mmdata; 2458fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2459fcdce8c4SStefano Zampini 24609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 24619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2462d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2463d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 246408401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 246508401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2466d60bce21SJunchao Zhang 2467fcdce8c4SStefano Zampini ptype = product->type; 2468fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 2469fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2470fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2471fa046f9fSJunchao Zhang } 2472fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 2473fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2474fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2475fa046f9fSJunchao Zhang } 2476fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2477fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2478fcdce8c4SStefano Zampini switch (ptype) { 2479fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2480fcdce8c4SStefano Zampini m = A->rmap->n; 2481fcdce8c4SStefano Zampini n = B->cmap->n; 2482fcdce8c4SStefano Zampini k = A->cmap->n; 2483fcdce8c4SStefano Zampini Amat = Acusp->mat; 2484fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2485fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2486fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2487fcdce8c4SStefano Zampini break; 2488fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2489fcdce8c4SStefano Zampini m = A->cmap->n; 2490fcdce8c4SStefano Zampini n = B->cmap->n; 2491fcdce8c4SStefano Zampini k = A->rmap->n; 24929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2493fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2494fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2495fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2496fcdce8c4SStefano Zampini break; 2497fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2498fcdce8c4SStefano Zampini m = A->rmap->n; 2499fcdce8c4SStefano Zampini n = B->rmap->n; 2500fcdce8c4SStefano Zampini k = A->cmap->n; 25019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2502fcdce8c4SStefano Zampini Amat = Acusp->mat; 2503fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2504fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2505fcdce8c4SStefano Zampini break; 2506fcdce8c4SStefano Zampini default: 250798921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2508fcdce8c4SStefano Zampini } 2509fcdce8c4SStefano Zampini 2510fcdce8c4SStefano Zampini /* create cusparse matrix */ 25119566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,m,n)); 25129566063dSJacob Faibussowitsch PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 2513fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 2514fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2515fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2516fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2517fcdce8c4SStefano Zampini 2518fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2519fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2520fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 25219566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 25229566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 2523fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2524fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2525fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2526fcdce8c4SStefano Zampini } else { 2527fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2528fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2529fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2530fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2531fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2532fcdce8c4SStefano Zampini } 2533fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2534fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2535fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2536fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2537fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2538fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 25399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 25409566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 25419566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 25429566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 25439566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 25449566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 25459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 25469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 25479566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 2548fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2549fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2550fcdce8c4SStefano Zampini c->nz = 0; 2551fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2552fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2553fcdce8c4SStefano Zampini goto finalizesym; 2554fcdce8c4SStefano Zampini } 2555fcdce8c4SStefano Zampini 255628b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 255728b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2558fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2559fcdce8c4SStefano Zampini if (!biscompressed) { 2560fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2561fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2562fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2563fcdce8c4SStefano Zampini #endif 2564fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2565fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2566fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2567fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2568fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2569fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2570fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2571fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2572fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2573fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2574fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 25759566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 2576fcdce8c4SStefano Zampini } 2577fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2578fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2579fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2580fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2581fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2582fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2583fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2584fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 25859566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 2586fcdce8c4SStefano Zampini } 2587fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2588fcdce8c4SStefano Zampini #endif 2589fcdce8c4SStefano Zampini } 259028b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 259128b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 2592fcdce8c4SStefano Zampini /* precompute flops count */ 2593fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2594fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2595fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2596fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2597fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2598fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2599fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 2600fcdce8c4SStefano Zampini } 2601fcdce8c4SStefano Zampini } 2602fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2603fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2604fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2605fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2606fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 2607fcdce8c4SStefano Zampini } 2608fcdce8c4SStefano Zampini } else { /* TODO */ 2609fcdce8c4SStefano Zampini flops = 0.; 2610fcdce8c4SStefano Zampini } 2611fcdce8c4SStefano Zampini 2612fcdce8c4SStefano Zampini mmdata->flops = flops; 26139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2614b4285af6SJunchao Zhang 2615fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 26169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2617fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2618fcdce8c4SStefano Zampini NULL, NULL, NULL, 2619fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 26209566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 26219566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2622b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2623b4285af6SJunchao Zhang { 2624b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 2625b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2626b4285af6SJunchao Zhang */ 2627b4285af6SJunchao Zhang void* dBuffer1 = NULL; 2628b4285af6SJunchao Zhang void* dBuffer2 = NULL; 2629b4285af6SJunchao Zhang void* dBuffer3 = NULL; 2630b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2631b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2632b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2633b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2634b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2635b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2636b4285af6SJunchao Zhang 2637b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2638b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 2639b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2640b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26419566063dSJacob Faibussowitsch &bufferSize1, NULL);PetscCallCUSPARSE(stat); 26429566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 2643b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 2644b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2645b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26469566063dSJacob Faibussowitsch &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 2647b4285af6SJunchao Zhang 2648b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2649b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2650b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26519566063dSJacob Faibussowitsch &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 26529566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 26539566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 26549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 2655b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2656b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26579566063dSJacob Faibussowitsch &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 26589566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer1)); 26599566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer2)); 2660b4285af6SJunchao Zhang 2661b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2662b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 26639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2664b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 2665b4285af6SJunchao Zhang /* allocate matrix C */ 26669566063dSJacob Faibussowitsch Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 26679566063dSJacob Faibussowitsch Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2668b4285af6SJunchao Zhang /* update matC with the new pointers */ 2669b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 26709566063dSJacob Faibussowitsch Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2671b4285af6SJunchao Zhang 2672b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2673b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2674b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26759566063dSJacob Faibussowitsch &bufferSize5, NULL);PetscCallCUSPARSE(stat); 26769566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 2677b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 2678b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 26799566063dSJacob Faibussowitsch &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 26809566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer3)); 2681b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 2682b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2683b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 26849566063dSJacob Faibussowitsch mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 26859566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 2686b4285af6SJunchao Zhang } 2687ae37ee31SJunchao Zhang #else 2688b4285af6SJunchao Zhang size_t bufSize2; 2689fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2690b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2691fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2692fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 26939566063dSJacob Faibussowitsch mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 26949566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 2695fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2696b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 2697fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2698fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 26999566063dSJacob Faibussowitsch mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 2700fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2701b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2702fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2703fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 27049566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 2705fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2706fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2707fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2708fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2709fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... */ 27109566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 2711fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2712b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 2713fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2714fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 27159566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2716fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 27179566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2718fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 27199566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 2720fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 27219566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2722fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 27239566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2724fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 27259566063dSJacob Faibussowitsch Ccsr->values->data().get());PetscCallCUSPARSE(stat); 2726b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 2727fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 27289566063dSJacob Faibussowitsch cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 2729ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2730fcdce8c4SStefano Zampini #else 27319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 2732b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 2733fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2734fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2735fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 27369566063dSJacob Faibussowitsch Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 2737fcdce8c4SStefano Zampini c->nz = cnz; 2738fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 27399566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2740fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 27419566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2742fcdce8c4SStefano Zampini 27439566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2744fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2745fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2746fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 2747b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 2748fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2749fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2750fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 27519566063dSJacob Faibussowitsch Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 2752fcdce8c4SStefano Zampini #endif 27539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 27549566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2755fcdce8c4SStefano Zampini finalizesym: 2756fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2757fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2758fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 27599566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m+1,&c->i)); 27609566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->j)); 2761fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2762fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2763fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2764fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2765fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2766fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2767fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 27689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 27699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2770fcdce8c4SStefano Zampini } else { 2771fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2772fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 27739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 27749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 2775fcdce8c4SStefano Zampini } 2776fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2777fcdce8c4SStefano Zampini PetscInt r = 0; 2778fcdce8c4SStefano Zampini c->i[0] = 0; 2779fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2780fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2781fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2782fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2783fcdce8c4SStefano Zampini } 2784fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2785fcdce8c4SStefano Zampini } 27869566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 27879566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->ilen)); 27889566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->imax)); 2789fcdce8c4SStefano Zampini c->maxnz = c->nz; 2790fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2791fcdce8c4SStefano Zampini c->rmax = 0; 2792fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2793fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2794fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2795fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 2796fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 2797fcdce8c4SStefano Zampini } 27989566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(C)); 27999566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->a)); 2800fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2801fcdce8c4SStefano Zampini 2802fcdce8c4SStefano Zampini C->nonzerostate++; 28039566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->rmap)); 28049566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->cmap)); 2805fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2806fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2807fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2808fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2809fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2810abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2811fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2812fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2813fcdce8c4SStefano Zampini } 2814fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2815fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2816fcdce8c4SStefano Zampini } 2817fcdce8c4SStefano Zampini 2818fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2819fcdce8c4SStefano Zampini 2820fcdce8c4SStefano Zampini /* handles sparse or dense B */ 2821fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2822fcdce8c4SStefano Zampini { 2823fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2824fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2825fcdce8c4SStefano Zampini 2826fcdce8c4SStefano Zampini PetscFunctionBegin; 2827fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 28289566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense)); 2829abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 28309566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp)); 2831fcdce8c4SStefano Zampini } 2832fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2833fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2834fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 28359566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp)); 2836fcdce8c4SStefano Zampini } 2837fcdce8c4SStefano Zampini } 283865e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 283965e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 284065e4b4d4SStefano Zampini switch (product->type) { 284165e4b4d4SStefano Zampini case MATPRODUCT_AB: 284265e4b4d4SStefano Zampini if (product->api_user) { 2843d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat"); 28449566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 2845d0609cedSBarry Smith PetscOptionsEnd(); 284665e4b4d4SStefano Zampini } else { 2847d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat"); 28489566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 2849d0609cedSBarry Smith PetscOptionsEnd(); 285065e4b4d4SStefano Zampini } 285165e4b4d4SStefano Zampini break; 285265e4b4d4SStefano Zampini case MATPRODUCT_AtB: 285365e4b4d4SStefano Zampini if (product->api_user) { 2854d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat"); 28559566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 2856d0609cedSBarry Smith PetscOptionsEnd(); 285765e4b4d4SStefano Zampini } else { 2858d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat"); 28599566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 2860d0609cedSBarry Smith PetscOptionsEnd(); 286165e4b4d4SStefano Zampini } 286265e4b4d4SStefano Zampini break; 286365e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 286465e4b4d4SStefano Zampini if (product->api_user) { 2865d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat"); 28669566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 2867d0609cedSBarry Smith PetscOptionsEnd(); 286865e4b4d4SStefano Zampini } else { 2869d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat"); 28709566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 2871d0609cedSBarry Smith PetscOptionsEnd(); 287265e4b4d4SStefano Zampini } 287365e4b4d4SStefano Zampini break; 287465e4b4d4SStefano Zampini case MATPRODUCT_RARt: 287565e4b4d4SStefano Zampini if (product->api_user) { 2876d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat"); 28779566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 2878d0609cedSBarry Smith PetscOptionsEnd(); 287965e4b4d4SStefano Zampini } else { 2880d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat"); 28819566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 2882d0609cedSBarry Smith PetscOptionsEnd(); 288365e4b4d4SStefano Zampini } 288465e4b4d4SStefano Zampini break; 288565e4b4d4SStefano Zampini case MATPRODUCT_ABC: 288665e4b4d4SStefano Zampini if (product->api_user) { 2887d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat"); 28889566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 2889d0609cedSBarry Smith PetscOptionsEnd(); 289065e4b4d4SStefano Zampini } else { 2891d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat"); 28929566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 2893d0609cedSBarry Smith PetscOptionsEnd(); 289465e4b4d4SStefano Zampini } 289565e4b4d4SStefano Zampini break; 289665e4b4d4SStefano Zampini default: 289765e4b4d4SStefano Zampini break; 289865e4b4d4SStefano Zampini } 289965e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 290065e4b4d4SStefano Zampini } 290165e4b4d4SStefano Zampini /* dispatch */ 2902fcdce8c4SStefano Zampini if (isdense) { 2903ccdfe979SStefano Zampini switch (product->type) { 2904ccdfe979SStefano Zampini case MATPRODUCT_AB: 2905ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2906ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2907ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2908ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2909fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 29109566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 2911fcdce8c4SStefano Zampini } else { 2912fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2913fcdce8c4SStefano Zampini } 2914fcdce8c4SStefano Zampini break; 2915fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2916fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2917fcdce8c4SStefano Zampini break; 2918ccdfe979SStefano Zampini default: 2919ccdfe979SStefano Zampini break; 2920ccdfe979SStefano Zampini } 2921fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2922fcdce8c4SStefano Zampini switch (product->type) { 2923fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2924fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2925fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2926fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2927fcdce8c4SStefano Zampini break; 2928fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2929fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2930fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2931fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2932fcdce8c4SStefano Zampini break; 2933fcdce8c4SStefano Zampini default: 2934fcdce8c4SStefano Zampini break; 2935fcdce8c4SStefano Zampini } 2936fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 29379566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 2938fcdce8c4SStefano Zampini } 2939ccdfe979SStefano Zampini PetscFunctionReturn(0); 2940ccdfe979SStefano Zampini } 2941ccdfe979SStefano Zampini 29426fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 29439ae82921SPaul Mullowney { 29449ae82921SPaul Mullowney PetscFunctionBegin; 29459566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE)); 2946e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2947e6e9a74fSStefano Zampini } 2948e6e9a74fSStefano Zampini 2949e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2950e6e9a74fSStefano Zampini { 2951e6e9a74fSStefano Zampini PetscFunctionBegin; 29529566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE)); 2953e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2954e6e9a74fSStefano Zampini } 2955e6e9a74fSStefano Zampini 2956e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2957e6e9a74fSStefano Zampini { 2958e6e9a74fSStefano Zampini PetscFunctionBegin; 29599566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE)); 2960e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2961e6e9a74fSStefano Zampini } 2962e6e9a74fSStefano Zampini 2963e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2964e6e9a74fSStefano Zampini { 2965e6e9a74fSStefano Zampini PetscFunctionBegin; 29669566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE)); 29679ae82921SPaul Mullowney PetscFunctionReturn(0); 29689ae82921SPaul Mullowney } 29699ae82921SPaul Mullowney 29706fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2971ca45077fSPaul Mullowney { 2972ca45077fSPaul Mullowney PetscFunctionBegin; 29739566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE)); 2974ca45077fSPaul Mullowney PetscFunctionReturn(0); 2975ca45077fSPaul Mullowney } 2976ca45077fSPaul Mullowney 2977a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 2978a0e72f99SJunchao Zhang { 2979a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 2980a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 2981a0e72f99SJunchao Zhang } 2982a0e72f99SJunchao Zhang 2983afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2984e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 29859ae82921SPaul Mullowney { 29869ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2987aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 29889ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2989e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2990e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2991e6e9a74fSStefano Zampini PetscBool compressed; 2992afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2993afb2bd1cSJunchao Zhang PetscInt nx,ny; 2994afb2bd1cSJunchao Zhang #endif 29956e111a19SKarl Rupp 29969ae82921SPaul Mullowney PetscFunctionBegin; 299708401ef6SPierre Jolivet PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 2998cbc6b225SStefano Zampini if (!a->nz) { 29999566063dSJacob Faibussowitsch if (!yy) PetscCall(VecSet_SeqCUDA(zz,0)); 30009566063dSJacob Faibussowitsch else PetscCall(VecCopy_SeqCUDA(yy,zz)); 3001e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3002e6e9a74fSStefano Zampini } 300334d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 30049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3005e6e9a74fSStefano Zampini if (!trans) { 30069ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 30075f80ce2aSJacob Faibussowitsch PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3008e6e9a74fSStefano Zampini } else { 30091a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3010e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3011e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3012e6e9a74fSStefano Zampini } else { 30139566063dSJacob Faibussowitsch if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3014e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3015e6e9a74fSStefano Zampini } 3016e6e9a74fSStefano Zampini } 3017e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3018e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3019213423ffSJunchao Zhang 3020e6e9a74fSStefano Zampini try { 30219566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray)); 30229566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 30239566063dSJacob Faibussowitsch else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */ 3024afb2bd1cSJunchao Zhang 30259566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3026e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3027afb2bd1cSJunchao Zhang /* z = A x + beta y. 3028afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3029afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3030afb2bd1cSJunchao Zhang */ 3031e6e9a74fSStefano Zampini xptr = xarray; 3032afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3033213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3034afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3035afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3036afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3037afb2bd1cSJunchao Zhang */ 3038afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3039afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3040afb2bd1cSJunchao Zhang nx = mat->num_cols; 3041afb2bd1cSJunchao Zhang ny = mat->num_rows; 3042afb2bd1cSJunchao Zhang } 3043afb2bd1cSJunchao Zhang #endif 3044e6e9a74fSStefano Zampini } else { 3045afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3046afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3047afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3048afb2bd1cSJunchao Zhang */ 3049afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3050e6e9a74fSStefano Zampini dptr = zarray; 3051e6e9a74fSStefano Zampini beta = yy ? matstruct->beta_one : matstruct->beta_zero; 3052afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3053e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3054a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3055e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3056e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 3057e6e9a74fSStefano Zampini } 3058afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3059afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3060afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3061afb2bd1cSJunchao Zhang nx = mat->num_rows; 3062afb2bd1cSJunchao Zhang ny = mat->num_cols; 3063afb2bd1cSJunchao Zhang } 3064afb2bd1cSJunchao Zhang #endif 3065e6e9a74fSStefano Zampini } 30669ae82921SPaul Mullowney 3067afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3068aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3069afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 30705f80ce2aSJacob Faibussowitsch PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3071afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 30729566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype)); 30739566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype)); 30749566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3075afb2bd1cSJunchao Zhang matstruct->matDescr, 3076afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 3077afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3078afb2bd1cSJunchao Zhang cusparse_scalartype, 3079afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 30805f80ce2aSJacob Faibussowitsch &matstruct->cuSpMV[opA].spmvBufferSize)); 30819566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize)); 3082afb2bd1cSJunchao Zhang 3083afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3084afb2bd1cSJunchao Zhang } else { 3085afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 30869566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr)); 30879566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr)); 3088afb2bd1cSJunchao Zhang } 3089afb2bd1cSJunchao Zhang 30909566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, 3091afb2bd1cSJunchao Zhang matstruct->alpha_one, 30923606e59fSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3093afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 3094afb2bd1cSJunchao Zhang beta, 3095afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3096afb2bd1cSJunchao Zhang cusparse_scalartype, 3097afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 30985f80ce2aSJacob Faibussowitsch matstruct->cuSpMV[opA].spmvBuffer)); 3099afb2bd1cSJunchao Zhang #else 31007656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 31019566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, 3102a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 3103afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 3104aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 3105e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 31065f80ce2aSJacob Faibussowitsch dptr)); 3107afb2bd1cSJunchao Zhang #endif 3108aa372e3fSPaul Mullowney } else { 3109213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3110afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3111afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3112afb2bd1cSJunchao Zhang #else 3113301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 31149566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, 3115afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 3116e6e9a74fSStefano Zampini xptr, beta, 31175f80ce2aSJacob Faibussowitsch dptr)); 3118afb2bd1cSJunchao Zhang #endif 3119a65300a6SPaul Mullowney } 3120aa372e3fSPaul Mullowney } 31219566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3122aa372e3fSPaul Mullowney 3123e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3124213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3125213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 31269566063dSJacob Faibussowitsch PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */ 3127e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 31289566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 31297656d835SStefano Zampini } 3130213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 31319566063dSJacob Faibussowitsch PetscCall(VecSet_SeqCUDA(zz,0)); 31327656d835SStefano Zampini } 31337656d835SStefano Zampini 3134213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3135213423ffSJunchao Zhang if (compressed) { 31369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3137a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3138a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3139a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 3140a0e72f99SJunchao Zhang */ 3141a0e72f99SJunchao Zhang #if 0 3142a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3143a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3144a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3145e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3146c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3147a0e72f99SJunchao Zhang #else 3148a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3149a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3150a0e72f99SJunchao Zhang #endif 31519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3152e6e9a74fSStefano Zampini } 3153e6e9a74fSStefano Zampini } else { 3154e6e9a74fSStefano Zampini if (yy && yy != zz) { 31559566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 3156e6e9a74fSStefano Zampini } 3157e6e9a74fSStefano Zampini } 31589566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray)); 31599566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray)); 31609566063dSJacob Faibussowitsch else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray)); 31619ae82921SPaul Mullowney } catch(char *ex) { 316298921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 31639ae82921SPaul Mullowney } 3164e6e9a74fSStefano Zampini if (yy) { 31659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz)); 3166e6e9a74fSStefano Zampini } else { 31679566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt)); 3168e6e9a74fSStefano Zampini } 31699ae82921SPaul Mullowney PetscFunctionReturn(0); 31709ae82921SPaul Mullowney } 31719ae82921SPaul Mullowney 31726fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3173ca45077fSPaul Mullowney { 3174ca45077fSPaul Mullowney PetscFunctionBegin; 31759566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE)); 3176ca45077fSPaul Mullowney PetscFunctionReturn(0); 3177ca45077fSPaul Mullowney } 3178ca45077fSPaul Mullowney 31796fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 31809ae82921SPaul Mullowney { 3181042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3182042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 31833fa6b06aSMark Adams 3184042217e8SBarry Smith PetscFunctionBegin; 31859566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 3186042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3187042217e8SBarry Smith 31889566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 31899566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->deviceMat)); 3190042217e8SBarry Smith cusp->deviceMat = NULL; 3191042217e8SBarry Smith } 31929ae82921SPaul Mullowney PetscFunctionReturn(0); 31939ae82921SPaul Mullowney } 31949ae82921SPaul Mullowney 31959ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3196e057df02SPaul Mullowney /*@ 31979ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3198e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 3199e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3200e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 3201e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 3202e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 32039ae82921SPaul Mullowney 3204d083f849SBarry Smith Collective 32059ae82921SPaul Mullowney 32069ae82921SPaul Mullowney Input Parameters: 32079ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 32089ae82921SPaul Mullowney . m - number of rows 32099ae82921SPaul Mullowney . n - number of columns 32109ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 32119ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 32120298fd71SBarry Smith (possibly different for each row) or NULL 32139ae82921SPaul Mullowney 32149ae82921SPaul Mullowney Output Parameter: 32159ae82921SPaul Mullowney . A - the matrix 32169ae82921SPaul Mullowney 32179ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 32189ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 32199ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 32209ae82921SPaul Mullowney 32219ae82921SPaul Mullowney Notes: 32229ae82921SPaul Mullowney If nnz is given then nz is ignored 32239ae82921SPaul Mullowney 32249ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 32259ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 32269ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 32279ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 32289ae82921SPaul Mullowney 32299ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 32300298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 32319ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 32329ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 32339ae82921SPaul Mullowney 32349ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 32359ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 32369ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 32379ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 32389ae82921SPaul Mullowney 32399ae82921SPaul Mullowney Level: intermediate 32409ae82921SPaul Mullowney 3241db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE` 32429ae82921SPaul Mullowney @*/ 32439ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 32449ae82921SPaul Mullowney { 32459ae82921SPaul Mullowney PetscFunctionBegin; 32469566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 32479566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,m,n)); 32489566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE)); 32499566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 32509ae82921SPaul Mullowney PetscFunctionReturn(0); 32519ae82921SPaul Mullowney } 32529ae82921SPaul Mullowney 32536fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 32549ae82921SPaul Mullowney { 32559ae82921SPaul Mullowney PetscFunctionBegin; 32569ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 32579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr)); 32589ae82921SPaul Mullowney } else { 32599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr)); 3260aa372e3fSPaul Mullowney } 32619566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 32629566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 32639566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL)); 32649566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 32659566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 32669566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 32679566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 32689566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 32699566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 32709566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL)); 32719566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 32729ae82921SPaul Mullowney PetscFunctionReturn(0); 32739ae82921SPaul Mullowney } 32749ae82921SPaul Mullowney 3275ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 327695639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 32779ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 32789ff858a8SKarl Rupp { 32799ff858a8SKarl Rupp PetscFunctionBegin; 32809566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B)); 32819566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B)); 32829ff858a8SKarl Rupp PetscFunctionReturn(0); 32839ff858a8SKarl Rupp } 32849ff858a8SKarl Rupp 3285039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 328695639643SRichard Tran Mills { 3287a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3288039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3289039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3290039c6fbaSStefano Zampini PetscScalar *ay; 3291039c6fbaSStefano Zampini const PetscScalar *ax; 3292039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 3293e6e9a74fSStefano Zampini 329495639643SRichard Tran Mills PetscFunctionBegin; 3295a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3296a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 3297039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 32989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 32999566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 3300a587d139SMark PetscFunctionReturn(0); 330195639643SRichard Tran Mills } 3302039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 33039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 33049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 33055f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 33065f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 3307039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 3308039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 3309039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3310039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3311039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 3312039c6fbaSStefano Zampini if (eq) { 3313039c6fbaSStefano Zampini eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 3314039c6fbaSStefano Zampini } 3315039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3316039c6fbaSStefano Zampini } 3317d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3318d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3319039c6fbaSStefano Zampini 3320039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3321039c6fbaSStefano Zampini PetscScalar b = 1.0; 3322039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3323039c6fbaSStefano Zampini size_t bufferSize; 3324039c6fbaSStefano Zampini void *buffer; 3325039c6fbaSStefano Zampini #endif 3326039c6fbaSStefano Zampini 33279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 33289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33299566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3330039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 33319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 3332039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3333039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33345f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize)); 33359566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer,bufferSize)); 33369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33379566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3338039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3339039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33405f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer)); 33419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 33429566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 33439566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 3344039c6fbaSStefano Zampini #else 33459566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33469566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 3347039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 3348039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 33495f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get())); 33509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 33519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3352039c6fbaSStefano Zampini #endif 33539566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 33549566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 33559566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 33569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3357039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3358a587d139SMark cublasHandle_t cublasv2handle; 3359a587d139SMark PetscBLASInt one = 1, bnz = 1; 3360039c6fbaSStefano Zampini 33619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 33629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33639566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 33649566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz,&bnz)); 33659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33669566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one)); 33679566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*bnz)); 33689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 33699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 33709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 33719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3372039c6fbaSStefano Zampini } else { 33739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 33749566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 3375a587d139SMark } 337695639643SRichard Tran Mills PetscFunctionReturn(0); 337795639643SRichard Tran Mills } 337895639643SRichard Tran Mills 337933c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 338033c9ba73SStefano Zampini { 338133c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 338233c9ba73SStefano Zampini PetscScalar *ay; 338333c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 338433c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 338533c9ba73SStefano Zampini 338633c9ba73SStefano Zampini PetscFunctionBegin; 33879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 33889566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 33899566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz,&bnz)); 33909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 33919566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one)); 33929566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(bnz)); 33939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 33949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 33959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 339633c9ba73SStefano Zampini PetscFunctionReturn(0); 339733c9ba73SStefano Zampini } 339833c9ba73SStefano Zampini 33993fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 34003fa6b06aSMark Adams { 34017e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3402a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 34037e8381f9SStefano Zampini 34043fa6b06aSMark Adams PetscFunctionBegin; 34053fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 34063fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 34077e8381f9SStefano Zampini if (spptr->mat) { 34087e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 34097e8381f9SStefano Zampini if (matrix->values) { 34107e8381f9SStefano Zampini both = PETSC_TRUE; 34117e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 34127e8381f9SStefano Zampini } 34137e8381f9SStefano Zampini } 34147e8381f9SStefano Zampini if (spptr->matTranspose) { 34157e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 34167e8381f9SStefano Zampini if (matrix->values) { 34177e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 34187e8381f9SStefano Zampini } 34197e8381f9SStefano Zampini } 34203fa6b06aSMark Adams } 34219566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n])); 34229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 34237e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3424a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 34253fa6b06aSMark Adams PetscFunctionReturn(0); 34263fa6b06aSMark Adams } 34273fa6b06aSMark Adams 3428a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3429a587d139SMark { 3430a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3431a587d139SMark 3432a587d139SMark PetscFunctionBegin; 34339a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 34349a14fc28SStefano Zampini A->boundtocpu = flg; 34359a14fc28SStefano Zampini PetscFunctionReturn(0); 34369a14fc28SStefano Zampini } 3437a587d139SMark if (flg) { 34389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3439a587d139SMark 344033c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3441a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3442a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3443a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3444a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3445a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3446a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3447a587d139SMark A->ops->multhermitiantranspose = NULL; 3448a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3449fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 34509566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps))); 34519566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 34529566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 34539566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 34549566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 34559566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 34569566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 3457a587d139SMark } else { 345833c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3459a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3460a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3461a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3462a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3463a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3464a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3465a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3466a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3467fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 346867a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 346967a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 347067a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 347167a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 347267a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 347367a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 34747ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 34757ee59b9bSJunchao Zhang 34769566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 34779566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 34789566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 34799566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE)); 34809566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE)); 34819566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 3482a587d139SMark } 3483a587d139SMark A->boundtocpu = flg; 3484ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 3485ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 3486ea500dcfSRichard Tran Mills } else { 3487ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 3488ea500dcfSRichard Tran Mills } 3489a587d139SMark PetscFunctionReturn(0); 3490a587d139SMark } 3491a587d139SMark 349249735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 34939ae82921SPaul Mullowney { 349449735bf3SStefano Zampini Mat B; 34959ae82921SPaul Mullowney 34969ae82921SPaul Mullowney PetscFunctionBegin; 34979566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 349849735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 34999566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); 350049735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 35019566063dSJacob Faibussowitsch PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); 350249735bf3SStefano Zampini } 350349735bf3SStefano Zampini B = *newmat; 350449735bf3SStefano Zampini 35059566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 35069566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype)); 350734136279SStefano Zampini 350849735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 35099ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3510e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 35119566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 35129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 35139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 35141a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3515d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3516ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301 3517a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3518a435da06SStefano Zampini #else 3519d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3520a435da06SStefano Zampini #endif 3521d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3522d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3523d8132acaSStefano Zampini #endif 35241a2c6b5cSJunchao Zhang B->spptr = spptr; 35259ae82921SPaul Mullowney } else { 3526e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3527e6e9a74fSStefano Zampini 35289566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 35299566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 35309566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 3531e6e9a74fSStefano Zampini B->spptr = spptr; 35329ae82921SPaul Mullowney } 3533e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 353449735bf3SStefano Zampini } 3535693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 35369ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 35371a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 35389ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 353995639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3540693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 35412205254eSKarl Rupp 35429566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE)); 35439566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE)); 35449566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 3545ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 35469566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 3547ae48a8d0SStefano Zampini #endif 35489566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 35499ae82921SPaul Mullowney PetscFunctionReturn(0); 35509ae82921SPaul Mullowney } 35519ae82921SPaul Mullowney 355202fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 355302fe1965SBarry Smith { 355402fe1965SBarry Smith PetscFunctionBegin; 35559566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(B)); 35569566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B)); 355702fe1965SBarry Smith PetscFunctionReturn(0); 355802fe1965SBarry Smith } 355902fe1965SBarry Smith 35603ca39a21SBarry Smith /*MC 3561e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3562e057df02SPaul Mullowney 3563e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 35642692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 35652692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3566e057df02SPaul Mullowney 3567e057df02SPaul Mullowney Options Database Keys: 3568e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3569aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3570a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3571365b711fSMark Adams + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3572e057df02SPaul Mullowney 3573e057df02SPaul Mullowney Level: beginner 3574e057df02SPaul Mullowney 3575db781477SPatrick Sanan .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 3576e057df02SPaul Mullowney M*/ 35777f756511SDominic Meiser 3578bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 35790f39cd5aSBarry Smith 35803ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 358142c9c57cSBarry Smith { 358242c9c57cSBarry Smith PetscFunctionBegin; 35839566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 35849566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 35859566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 35869566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 35879566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 3588bddcd29dSMark Adams 358942c9c57cSBarry Smith PetscFunctionReturn(0); 359042c9c57cSBarry Smith } 359129b38603SBarry Smith 3592cbc6b225SStefano Zampini static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3593cbc6b225SStefano Zampini { 3594cbc6b225SStefano Zampini Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 3595cbc6b225SStefano Zampini 3596cbc6b225SStefano Zampini PetscFunctionBegin; 3597cbc6b225SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3598cbc6b225SStefano Zampini delete cusp->cooPerm; 3599cbc6b225SStefano Zampini delete cusp->cooPerm_a; 3600cbc6b225SStefano Zampini cusp->cooPerm = NULL; 3601cbc6b225SStefano Zampini cusp->cooPerm_a = NULL; 3602cbc6b225SStefano Zampini if (cusp->use_extended_coo) { 36039566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->jmap_d)); 36049566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->perm_d)); 3605cbc6b225SStefano Zampini } 3606cbc6b225SStefano Zampini cusp->use_extended_coo = PETSC_FALSE; 3607cbc6b225SStefano Zampini PetscFunctionReturn(0); 3608cbc6b225SStefano Zampini } 3609cbc6b225SStefano Zampini 3610470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 36117f756511SDominic Meiser { 36127f756511SDominic Meiser PetscFunctionBegin; 36137f756511SDominic Meiser if (*cusparsestruct) { 36149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 36159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 36167f756511SDominic Meiser delete (*cusparsestruct)->workVector; 361781902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 36187e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 36197e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3620a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 36219566063dSJacob Faibussowitsch if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 36229566063dSJacob Faibussowitsch if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 36239566063dSJacob Faibussowitsch if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 36249566063dSJacob Faibussowitsch PetscCall(PetscFree(*cusparsestruct)); 36257f756511SDominic Meiser } 36267f756511SDominic Meiser PetscFunctionReturn(0); 36277f756511SDominic Meiser } 36287f756511SDominic Meiser 36297f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 36307f756511SDominic Meiser { 36317f756511SDominic Meiser PetscFunctionBegin; 36327f756511SDominic Meiser if (*mat) { 36337f756511SDominic Meiser delete (*mat)->values; 36347f756511SDominic Meiser delete (*mat)->column_indices; 36357f756511SDominic Meiser delete (*mat)->row_offsets; 36367f756511SDominic Meiser delete *mat; 36377f756511SDominic Meiser *mat = 0; 36387f756511SDominic Meiser } 36397f756511SDominic Meiser PetscFunctionReturn(0); 36407f756511SDominic Meiser } 36417f756511SDominic Meiser 3642470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 36437f756511SDominic Meiser { 36447f756511SDominic Meiser PetscFunctionBegin; 36457f756511SDominic Meiser if (*trifactor) { 36469566063dSJacob Faibussowitsch if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3647*261a78b4SJunchao Zhang if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 36489566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 36499566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 36509566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3651afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 36529566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3653afb2bd1cSJunchao Zhang #endif 36549566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactor)); 36557f756511SDominic Meiser } 36567f756511SDominic Meiser PetscFunctionReturn(0); 36577f756511SDominic Meiser } 36587f756511SDominic Meiser 3659470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 36607f756511SDominic Meiser { 36617f756511SDominic Meiser CsrMatrix *mat; 36627f756511SDominic Meiser 36637f756511SDominic Meiser PetscFunctionBegin; 36647f756511SDominic Meiser if (*matstruct) { 36657f756511SDominic Meiser if ((*matstruct)->mat) { 36667f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3667afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3668afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3669afb2bd1cSJunchao Zhang #else 36707f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 36719566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3672afb2bd1cSJunchao Zhang #endif 36737f756511SDominic Meiser } else { 36747f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 36757f756511SDominic Meiser CsrMatrix_Destroy(&mat); 36767f756511SDominic Meiser } 36777f756511SDominic Meiser } 36789566063dSJacob Faibussowitsch if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 36797f756511SDominic Meiser delete (*matstruct)->cprowIndices; 36809566063dSJacob Faibussowitsch if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 36819566063dSJacob Faibussowitsch if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 36829566063dSJacob Faibussowitsch if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3683afb2bd1cSJunchao Zhang 3684afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3685afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 36869566063dSJacob Faibussowitsch if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3687afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3688afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 36899566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 36909566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 36919566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3692afb2bd1cSJunchao Zhang } 3693afb2bd1cSJunchao Zhang } 3694afb2bd1cSJunchao Zhang #endif 36957f756511SDominic Meiser delete *matstruct; 36967e8381f9SStefano Zampini *matstruct = NULL; 36977f756511SDominic Meiser } 36987f756511SDominic Meiser PetscFunctionReturn(0); 36997f756511SDominic Meiser } 37007f756511SDominic Meiser 3701e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 37027f756511SDominic Meiser { 37037f756511SDominic Meiser PetscFunctionBegin; 37047f756511SDominic Meiser if (*trifactors) { 37059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr)); 37069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr)); 37079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose)); 37089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose)); 37097f756511SDominic Meiser delete (*trifactors)->rpermIndices; 37107f756511SDominic Meiser delete (*trifactors)->cpermIndices; 37117f756511SDominic Meiser delete (*trifactors)->workVector; 37127e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 37137e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 37147e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 37159566063dSJacob Faibussowitsch if ((*trifactors)->a_band_d) PetscCallCUDA(cudaFree((*trifactors)->a_band_d)); 37169566063dSJacob Faibussowitsch if ((*trifactors)->i_band_d) PetscCallCUDA(cudaFree((*trifactors)->i_band_d)); 3717e8d2b73aSMark Adams (*trifactors)->init_dev_prop = PETSC_FALSE; 3718ccdfe979SStefano Zampini } 3719ccdfe979SStefano Zampini PetscFunctionReturn(0); 3720ccdfe979SStefano Zampini } 3721ccdfe979SStefano Zampini 3722ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3723ccdfe979SStefano Zampini { 3724ccdfe979SStefano Zampini cusparseHandle_t handle; 3725ccdfe979SStefano Zampini 3726ccdfe979SStefano Zampini PetscFunctionBegin; 3727ccdfe979SStefano Zampini if (*trifactors) { 37289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 37297f756511SDominic Meiser if (handle = (*trifactors)->handle) { 37309566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroy(handle)); 37317f756511SDominic Meiser } 37329566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 37337f756511SDominic Meiser } 37347f756511SDominic Meiser PetscFunctionReturn(0); 37357f756511SDominic Meiser } 37367e8381f9SStefano Zampini 37377e8381f9SStefano Zampini struct IJCompare 37387e8381f9SStefano Zampini { 37397e8381f9SStefano Zampini __host__ __device__ 37407e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 37417e8381f9SStefano Zampini { 37427e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 37437e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 37447e8381f9SStefano Zampini return false; 37457e8381f9SStefano Zampini } 37467e8381f9SStefano Zampini }; 37477e8381f9SStefano Zampini 37487e8381f9SStefano Zampini struct IJEqual 37497e8381f9SStefano Zampini { 37507e8381f9SStefano Zampini __host__ __device__ 37517e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 37527e8381f9SStefano Zampini { 37537e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 37547e8381f9SStefano Zampini return true; 37557e8381f9SStefano Zampini } 37567e8381f9SStefano Zampini }; 37577e8381f9SStefano Zampini 37587e8381f9SStefano Zampini struct IJDiff 37597e8381f9SStefano Zampini { 37607e8381f9SStefano Zampini __host__ __device__ 37617e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 37627e8381f9SStefano Zampini { 37637e8381f9SStefano Zampini return t1 == t2 ? 0 : 1; 37647e8381f9SStefano Zampini } 37657e8381f9SStefano Zampini }; 37667e8381f9SStefano Zampini 37677e8381f9SStefano Zampini struct IJSum 37687e8381f9SStefano Zampini { 37697e8381f9SStefano Zampini __host__ __device__ 37707e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 37717e8381f9SStefano Zampini { 37727e8381f9SStefano Zampini return t1||t2; 37737e8381f9SStefano Zampini } 37747e8381f9SStefano Zampini }; 37757e8381f9SStefano Zampini 37767e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3777219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 3778219fbbafSJunchao Zhang PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 37797e8381f9SStefano Zampini { 37807e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3781fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3782bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 378308391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 37847e8381f9SStefano Zampini CsrMatrix *matrix; 37857e8381f9SStefano Zampini PetscInt n; 37867e8381f9SStefano Zampini 37877e8381f9SStefano Zampini PetscFunctionBegin; 378828b400f6SJacob Faibussowitsch PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 378928b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 37907e8381f9SStefano Zampini if (!cusp->cooPerm) { 37919566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 37929566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 37937e8381f9SStefano Zampini PetscFunctionReturn(0); 37947e8381f9SStefano Zampini } 37957e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 379628b400f6SJacob Faibussowitsch PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3797e61fc153SStefano Zampini if (!v) { 3798e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3799e61fc153SStefano Zampini goto finalize; 38007e8381f9SStefano Zampini } 3801e61fc153SStefano Zampini n = cusp->cooPerm->size(); 380208391a17SStefano Zampini if (isCudaMem(v)) { 380308391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 380408391a17SStefano Zampini } else { 3805e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3806e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 380708391a17SStefano Zampini d_v = cooPerm_v->data(); 38089566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 380908391a17SStefano Zampini } 38109566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3811e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 3812ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 3813bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 381408391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3815ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 3816ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 3817ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 3818ddea5d60SJunchao Zhang */ 3819e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3820e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3821e61fc153SStefano Zampini delete cooPerm_w; 38227e8381f9SStefano Zampini } else { 3823ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 382408391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38257e8381f9SStefano Zampini matrix->values->begin())); 382608391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 38277e8381f9SStefano Zampini matrix->values->end())); 3828ddea5d60SJunchao Zhang thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 38297e8381f9SStefano Zampini } 38307e8381f9SStefano Zampini } else { 3831e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 383208391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3833e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 38347e8381f9SStefano Zampini } else { 383508391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 38367e8381f9SStefano Zampini matrix->values->begin())); 383708391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 38387e8381f9SStefano Zampini matrix->values->end())); 38397e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 38407e8381f9SStefano Zampini } 38417e8381f9SStefano Zampini } 38429566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3843e61fc153SStefano Zampini finalize: 3844e61fc153SStefano Zampini delete cooPerm_v; 38457e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 38469566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 3847fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 38489566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 38499566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n")); 38509566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax)); 3851fcdce8c4SStefano Zampini a->reallocs = 0; 3852fcdce8c4SStefano Zampini A->info.mallocs += 0; 3853fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3854fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3855fcdce8c4SStefano Zampini A->num_ass++; 38567e8381f9SStefano Zampini PetscFunctionReturn(0); 38577e8381f9SStefano Zampini } 38587e8381f9SStefano Zampini 3859a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 3860a49f1ed0SStefano Zampini { 3861a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3862a49f1ed0SStefano Zampini 3863a49f1ed0SStefano Zampini PetscFunctionBegin; 3864a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3865a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3866a49f1ed0SStefano Zampini if (destroy) { 38679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format)); 3868a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 3869a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 3870a49f1ed0SStefano Zampini } 38711a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 3872a49f1ed0SStefano Zampini PetscFunctionReturn(0); 3873a49f1ed0SStefano Zampini } 3874a49f1ed0SStefano Zampini 38757e8381f9SStefano Zampini #include <thrust/binary_search.h> 3876219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 3877219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 38787e8381f9SStefano Zampini { 38797e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 38807e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 38817e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 38827e8381f9SStefano Zampini 38837e8381f9SStefano Zampini PetscFunctionBegin; 38849566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->rmap)); 38859566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->cmap)); 38867e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 38877e8381f9SStefano Zampini if (n != cooPerm_n) { 38887e8381f9SStefano Zampini delete cusp->cooPerm; 38897e8381f9SStefano Zampini delete cusp->cooPerm_a; 38907e8381f9SStefano Zampini cusp->cooPerm = NULL; 38917e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 38927e8381f9SStefano Zampini } 38937e8381f9SStefano Zampini if (n) { 38947e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 38957e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 38967e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 38977e8381f9SStefano Zampini 38987e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 38997e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 39007e8381f9SStefano Zampini 39019566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 39027e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 39037e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 3904ddea5d60SJunchao Zhang 3905ddea5d60SJunchao Zhang /* Ex. 3906ddea5d60SJunchao Zhang n = 6 3907ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 3908ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 3909ddea5d60SJunchao Zhang */ 39107e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 39117e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 39127e8381f9SStefano Zampini 39139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 39147e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 3915ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 3916ddea5d60SJunchao Zhang *cusp->cooPerm_a = d_i; /* copy the sorted array */ 39177e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 39187e8381f9SStefano Zampini 3919ddea5d60SJunchao Zhang /* 3920ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 3921ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 3922ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 3923ddea5d60SJunchao Zhang */ 3924ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 3925ddea5d60SJunchao Zhang 3926ddea5d60SJunchao Zhang /* 3927ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 3928ddea5d60SJunchao Zhang ^ekey 3929ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 3930ddea5d60SJunchao Zhang ^nekye 3931ddea5d60SJunchao Zhang */ 39327e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 39337e8381f9SStefano Zampini delete cusp->cooPerm_a; 39347e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 3935ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 3936ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 3937ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 3938ddea5d60SJunchao Zhang adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 3939ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 39407e8381f9SStefano Zampini w[0] = 0; 3941ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 3942ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 39437e8381f9SStefano Zampini } 39447e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 3945ddea5d60SJunchao Zhang thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 3946ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 3947ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 39489566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 39497e8381f9SStefano Zampini 39509566063dSJacob Faibussowitsch PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i)); 39517e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 39527e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 39537e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 39549566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->n+1,&a->i)); 3955ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 39569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 39577e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 3958fcdce8c4SStefano Zampini a->rmax = 0; 39599566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz,&a->a)); 39609566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz,&a->j)); 39619566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 39629566063dSJacob Faibussowitsch if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen)); 39639566063dSJacob Faibussowitsch if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax)); 39647e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 39657e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 39667e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 39677e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 3968fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 39697e8381f9SStefano Zampini } 3970fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 39717e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 39729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt))); 39739566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(A)); 39747e8381f9SStefano Zampini } else { 39759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(A,0,NULL)); 39767e8381f9SStefano Zampini } 39779566063dSJacob Faibussowitsch PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 39787e8381f9SStefano Zampini 39797e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 3980e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 39819566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->nz)); 39829566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6)); 39837e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 39849566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 39859566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 39867e8381f9SStefano Zampini PetscFunctionReturn(0); 39877e8381f9SStefano Zampini } 3988ed502f03SStefano Zampini 3989219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 3990219fbbafSJunchao Zhang { 3991219fbbafSJunchao Zhang Mat_SeqAIJ *seq; 3992219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev; 3993cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 3994219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 3995219fbbafSJunchao Zhang 3996219fbbafSJunchao Zhang PetscFunctionBegin; 39979566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 39989566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 3999219fbbafSJunchao Zhang if (coo_i) { 40009566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i,&mtype)); 4001219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 4002219fbbafSJunchao Zhang for (PetscCount k=0; k<coo_n; k++) { 4003cbc6b225SStefano Zampini if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4004219fbbafSJunchao Zhang } 4005219fbbafSJunchao Zhang } 4006219fbbafSJunchao Zhang } 4007219fbbafSJunchao Zhang 4008219fbbafSJunchao Zhang if (coo_basic) { /* i,j are on device or do not contain negative indices */ 40099566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 4010219fbbafSJunchao Zhang } else { 40119566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 4012cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 40139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4014219fbbafSJunchao Zhang seq = static_cast<Mat_SeqAIJ*>(mat->data); 4015219fbbafSJunchao Zhang dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 40169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount))); 40179566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 40189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount))); 40199566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4020219fbbafSJunchao Zhang dev->use_extended_coo = PETSC_TRUE; 4021219fbbafSJunchao Zhang } 4022219fbbafSJunchao Zhang PetscFunctionReturn(0); 4023219fbbafSJunchao Zhang } 4024219fbbafSJunchao Zhang 402577804d84SJunchao Zhang __global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4026219fbbafSJunchao Zhang { 4027219fbbafSJunchao Zhang PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4028219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4029b6c38306SJunchao Zhang for (; i<nnz; i+= grid_size) { 4030b6c38306SJunchao Zhang PetscScalar sum = 0.0; 4031b6c38306SJunchao Zhang for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4032b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum; 4033b6c38306SJunchao Zhang } 4034219fbbafSJunchao Zhang } 4035219fbbafSJunchao Zhang 4036219fbbafSJunchao Zhang PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4037219fbbafSJunchao Zhang { 4038219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4039219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4040219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4041219fbbafSJunchao Zhang PetscMemType memtype; 4042219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4043219fbbafSJunchao Zhang PetscScalar *Aa; 4044219fbbafSJunchao Zhang 4045219fbbafSJunchao Zhang PetscFunctionBegin; 4046219fbbafSJunchao Zhang if (dev->use_extended_coo) { 40479566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v,&memtype)); 4048219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 40499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 40509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4051219fbbafSJunchao Zhang } 4052219fbbafSJunchao Zhang 40539566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 40549566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4055219fbbafSJunchao Zhang 4056cbc6b225SStefano Zampini if (Annz) { 4057b6c38306SJunchao Zhang MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 40589566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 4059cbc6b225SStefano Zampini } 4060219fbbafSJunchao Zhang 40619566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 40629566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4063219fbbafSJunchao Zhang 40649566063dSJacob Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); 4065219fbbafSJunchao Zhang } else { 40669566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4067219fbbafSJunchao Zhang } 4068219fbbafSJunchao Zhang PetscFunctionReturn(0); 4069219fbbafSJunchao Zhang } 4070219fbbafSJunchao Zhang 40715b7e41feSStefano Zampini /*@C 40725b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 40735b7e41feSStefano Zampini 40745b7e41feSStefano Zampini Not collective 40755b7e41feSStefano Zampini 40765b7e41feSStefano Zampini Input Parameters: 40775b7e41feSStefano Zampini + A - the matrix 40785b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 40795b7e41feSStefano Zampini 40805b7e41feSStefano Zampini Output Parameters: 40815b7e41feSStefano Zampini + ia - the CSR row pointers 40825b7e41feSStefano Zampini - ja - the CSR column indices 40835b7e41feSStefano Zampini 40845b7e41feSStefano Zampini Level: developer 40855b7e41feSStefano Zampini 40865b7e41feSStefano Zampini Notes: 40875b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 40885b7e41feSStefano Zampini 4089db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 40905b7e41feSStefano Zampini @*/ 40915f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 40925f101d05SStefano Zampini { 40935f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 40945f101d05SStefano Zampini CsrMatrix *csr; 40955f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 40965f101d05SStefano Zampini 40975f101d05SStefano Zampini PetscFunctionBegin; 40985f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 40995f101d05SStefano Zampini if (!i || !j) PetscFunctionReturn(0); 41005f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4101aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 41029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 410328b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 41045f101d05SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 41055f101d05SStefano Zampini if (i) { 41065f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 41075f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 41085f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 41095f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 41109566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 41115f101d05SStefano Zampini } 41125f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 41135f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 41145f101d05SStefano Zampini } 41155f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 41165f101d05SStefano Zampini PetscFunctionReturn(0); 41175f101d05SStefano Zampini } 41185f101d05SStefano Zampini 41195b7e41feSStefano Zampini /*@C 41205b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 41215b7e41feSStefano Zampini 41225b7e41feSStefano Zampini Not collective 41235b7e41feSStefano Zampini 41245b7e41feSStefano Zampini Input Parameters: 41255b7e41feSStefano Zampini + A - the matrix 41265b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 41275b7e41feSStefano Zampini 41285b7e41feSStefano Zampini Output Parameters: 41295b7e41feSStefano Zampini + ia - the CSR row pointers 41305b7e41feSStefano Zampini - ja - the CSR column indices 41315b7e41feSStefano Zampini 41325b7e41feSStefano Zampini Level: developer 41335b7e41feSStefano Zampini 4134db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()` 41355b7e41feSStefano Zampini @*/ 41365f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 41375f101d05SStefano Zampini { 41385f101d05SStefano Zampini PetscFunctionBegin; 41395f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 41405f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 41415f101d05SStefano Zampini if (i) *i = NULL; 41425f101d05SStefano Zampini if (j) *j = NULL; 41435f101d05SStefano Zampini PetscFunctionReturn(0); 41445f101d05SStefano Zampini } 41455f101d05SStefano Zampini 41465b7e41feSStefano Zampini /*@C 41475b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 41485b7e41feSStefano Zampini 41495b7e41feSStefano Zampini Not Collective 41505b7e41feSStefano Zampini 41515b7e41feSStefano Zampini Input Parameter: 41525b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41535b7e41feSStefano Zampini 41545b7e41feSStefano Zampini Output Parameter: 41555b7e41feSStefano Zampini . a - pointer to the device data 41565b7e41feSStefano Zampini 41575b7e41feSStefano Zampini Level: developer 41585b7e41feSStefano Zampini 41595b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 41605b7e41feSStefano Zampini 4161db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 41625b7e41feSStefano Zampini @*/ 4163ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4164ed502f03SStefano Zampini { 4165ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4166ed502f03SStefano Zampini CsrMatrix *csr; 4167ed502f03SStefano Zampini 4168ed502f03SStefano Zampini PetscFunctionBegin; 4169ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4170ed502f03SStefano Zampini PetscValidPointer(a,2); 4171ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4172aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 41739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 417428b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4175ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 417628b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4177ed502f03SStefano Zampini *a = csr->values->data().get(); 4178ed502f03SStefano Zampini PetscFunctionReturn(0); 4179ed502f03SStefano Zampini } 4180ed502f03SStefano Zampini 41815b7e41feSStefano Zampini /*@C 41825b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 41835b7e41feSStefano Zampini 41845b7e41feSStefano Zampini Not Collective 41855b7e41feSStefano Zampini 41865b7e41feSStefano Zampini Input Parameter: 41875b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 41885b7e41feSStefano Zampini 41895b7e41feSStefano Zampini Output Parameter: 41905b7e41feSStefano Zampini . a - pointer to the device data 41915b7e41feSStefano Zampini 41925b7e41feSStefano Zampini Level: developer 41935b7e41feSStefano Zampini 4194db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 41955b7e41feSStefano Zampini @*/ 4196ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4197ed502f03SStefano Zampini { 4198ed502f03SStefano Zampini PetscFunctionBegin; 4199ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4200ed502f03SStefano Zampini PetscValidPointer(a,2); 4201ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4202ed502f03SStefano Zampini *a = NULL; 4203ed502f03SStefano Zampini PetscFunctionReturn(0); 4204ed502f03SStefano Zampini } 4205ed502f03SStefano Zampini 42065b7e41feSStefano Zampini /*@C 42075b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42085b7e41feSStefano Zampini 42095b7e41feSStefano Zampini Not Collective 42105b7e41feSStefano Zampini 42115b7e41feSStefano Zampini Input Parameter: 42125b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42135b7e41feSStefano Zampini 42145b7e41feSStefano Zampini Output Parameter: 42155b7e41feSStefano Zampini . a - pointer to the device data 42165b7e41feSStefano Zampini 42175b7e41feSStefano Zampini Level: developer 42185b7e41feSStefano Zampini 42195b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 42205b7e41feSStefano Zampini 4221db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 42225b7e41feSStefano Zampini @*/ 4223039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4224039c6fbaSStefano Zampini { 4225039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4226039c6fbaSStefano Zampini CsrMatrix *csr; 4227039c6fbaSStefano Zampini 4228039c6fbaSStefano Zampini PetscFunctionBegin; 4229039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4230039c6fbaSStefano Zampini PetscValidPointer(a,2); 4231039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4232aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 42339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 423428b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4235039c6fbaSStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 423628b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4237039c6fbaSStefano Zampini *a = csr->values->data().get(); 4238039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 42399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 4240039c6fbaSStefano Zampini PetscFunctionReturn(0); 4241039c6fbaSStefano Zampini } 42425b7e41feSStefano Zampini /*@C 42435b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 4244039c6fbaSStefano Zampini 42455b7e41feSStefano Zampini Not Collective 42465b7e41feSStefano Zampini 42475b7e41feSStefano Zampini Input Parameter: 42485b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42495b7e41feSStefano Zampini 42505b7e41feSStefano Zampini Output Parameter: 42515b7e41feSStefano Zampini . a - pointer to the device data 42525b7e41feSStefano Zampini 42535b7e41feSStefano Zampini Level: developer 42545b7e41feSStefano Zampini 4255db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()` 42565b7e41feSStefano Zampini @*/ 4257039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 4258039c6fbaSStefano Zampini { 4259039c6fbaSStefano Zampini PetscFunctionBegin; 4260039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4261039c6fbaSStefano Zampini PetscValidPointer(a,2); 4262039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 42639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 42649566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4265039c6fbaSStefano Zampini *a = NULL; 4266039c6fbaSStefano Zampini PetscFunctionReturn(0); 4267039c6fbaSStefano Zampini } 4268039c6fbaSStefano Zampini 42695b7e41feSStefano Zampini /*@C 42705b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 42715b7e41feSStefano Zampini 42725b7e41feSStefano Zampini Not Collective 42735b7e41feSStefano Zampini 42745b7e41feSStefano Zampini Input Parameter: 42755b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 42765b7e41feSStefano Zampini 42775b7e41feSStefano Zampini Output Parameter: 42785b7e41feSStefano Zampini . a - pointer to the device data 42795b7e41feSStefano Zampini 42805b7e41feSStefano Zampini Level: developer 42815b7e41feSStefano Zampini 42825b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 42835b7e41feSStefano Zampini 4284db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 42855b7e41feSStefano Zampini @*/ 4286ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 4287ed502f03SStefano Zampini { 4288ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4289ed502f03SStefano Zampini CsrMatrix *csr; 4290ed502f03SStefano Zampini 4291ed502f03SStefano Zampini PetscFunctionBegin; 4292ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4293ed502f03SStefano Zampini PetscValidPointer(a,2); 4294ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4295aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 429628b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4297ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 429828b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4299ed502f03SStefano Zampini *a = csr->values->data().get(); 4300039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 43019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 4302ed502f03SStefano Zampini PetscFunctionReturn(0); 4303ed502f03SStefano Zampini } 4304ed502f03SStefano Zampini 43055b7e41feSStefano Zampini /*@C 43065b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 43075b7e41feSStefano Zampini 43085b7e41feSStefano Zampini Not Collective 43095b7e41feSStefano Zampini 43105b7e41feSStefano Zampini Input Parameter: 43115b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 43125b7e41feSStefano Zampini 43135b7e41feSStefano Zampini Output Parameter: 43145b7e41feSStefano Zampini . a - pointer to the device data 43155b7e41feSStefano Zampini 43165b7e41feSStefano Zampini Level: developer 43175b7e41feSStefano Zampini 4318db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 43195b7e41feSStefano Zampini @*/ 4320ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 4321ed502f03SStefano Zampini { 4322ed502f03SStefano Zampini PetscFunctionBegin; 4323ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4324ed502f03SStefano Zampini PetscValidPointer(a,2); 4325ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 43269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 43279566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4328ed502f03SStefano Zampini *a = NULL; 4329ed502f03SStefano Zampini PetscFunctionReturn(0); 4330ed502f03SStefano Zampini } 4331ed502f03SStefano Zampini 4332ed502f03SStefano Zampini struct IJCompare4 4333ed502f03SStefano Zampini { 4334ed502f03SStefano Zampini __host__ __device__ 43352ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4336ed502f03SStefano Zampini { 4337ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4338ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4339ed502f03SStefano Zampini return false; 4340ed502f03SStefano Zampini } 4341ed502f03SStefano Zampini }; 4342ed502f03SStefano Zampini 43438909a122SStefano Zampini struct Shift 43448909a122SStefano Zampini { 4345ed502f03SStefano Zampini int _shift; 4346ed502f03SStefano Zampini 4347ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 4348ed502f03SStefano Zampini __host__ __device__ 4349ed502f03SStefano Zampini inline int operator() (const int &c) 4350ed502f03SStefano Zampini { 4351ed502f03SStefano Zampini return c + _shift; 4352ed502f03SStefano Zampini } 4353ed502f03SStefano Zampini }; 4354ed502f03SStefano Zampini 4355ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4356ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 4357ed502f03SStefano Zampini { 4358ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 4359ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 4360ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4361ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 4362ed502f03SStefano Zampini PetscInt Annz,Bnnz; 4363ed502f03SStefano Zampini cusparseStatus_t stat; 4364ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 4365ed502f03SStefano Zampini 4366ed502f03SStefano Zampini PetscFunctionBegin; 4367ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4368ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 4369ed502f03SStefano Zampini PetscValidPointer(C,4); 4370ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4371ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 43725f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 437308401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 4374aed4548fSBarry Smith PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4375aed4548fSBarry Smith PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 4376ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4377ed502f03SStefano Zampini m = A->rmap->n; 4378ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 43799566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,C)); 43809566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C,m,n,m,n)); 43819566063dSJacob Faibussowitsch PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE)); 4382ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4383ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 4384ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4385ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4386ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4387ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4388ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4389ed502f03SStefano Zampini c->compressedrow.i = NULL; 4390ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4391ed502f03SStefano Zampini Ccusp->workVector = NULL; 4392ed502f03SStefano Zampini Ccusp->nrows = m; 4393ed502f03SStefano Zampini Ccusp->mat = Cmat; 4394ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4395ed502f03SStefano Zampini Ccsr->num_rows = m; 4396ed502f03SStefano Zampini Ccsr->num_cols = n; 43979566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 43989566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 43999566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 44009566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 44019566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 44029566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 44039566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44049566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44059566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 44069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 44079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 440828b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 440928b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4410ed502f03SStefano Zampini 4411ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4412ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4413ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4414ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4415ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4416ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 4417ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4418ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4419ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4420ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4421ed502f03SStefano Zampini if (c->nz) { 44222ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 44232ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 44242ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 44252ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff,*Broff; 44262ed87e7eSStefano Zampini 4427ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4428ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4429ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4430ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 44319566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 4432ed502f03SStefano Zampini } 44332ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 44342ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4435ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4436ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4437ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4438ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 44399566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 4440ed502f03SStefano Zampini } 44412ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 44422ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 44439566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 44442ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 44452ed87e7eSStefano Zampini Aroff->data().get(), 44462ed87e7eSStefano Zampini Annz, 44472ed87e7eSStefano Zampini m, 44482ed87e7eSStefano Zampini Acoo->data().get(), 44499566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 4450ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 44512ed87e7eSStefano Zampini Broff->data().get(), 4452ed502f03SStefano Zampini Bnnz, 4453ed502f03SStefano Zampini m, 44542ed87e7eSStefano Zampini Bcoo->data().get(), 44559566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 44562ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 44572ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 44582ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 44598909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 4460ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 4461ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 44628909a122SStefano Zampini #else 44638909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 44648909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 44658909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 44668909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 44678909a122SStefano Zampini #endif 44682ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 44692ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 44702ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 44712ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 44722ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 44732ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 4474ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 4475ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4476ed502f03SStefano Zampini thrust::advance(p2,Annz); 44772ed87e7eSStefano Zampini PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 44788909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 44798909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 44808909a122SStefano Zampini #endif 44812ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 44822ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 44832ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 44842ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 44852ed87e7eSStefano Zampini #else 44862ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 44872ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 44882ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 44892ed87e7eSStefano Zampini #endif 4490ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 44912ed87e7eSStefano Zampini Ccoo->data().get(), 4492ed502f03SStefano Zampini c->nz, 4493ed502f03SStefano Zampini m, 4494ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 44959566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 44969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 44972ed87e7eSStefano Zampini delete wPerm; 44982ed87e7eSStefano Zampini delete Acoo; 44992ed87e7eSStefano Zampini delete Bcoo; 45002ed87e7eSStefano Zampini delete Ccoo; 4501ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4502ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 4503ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 4504ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 45059566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4506ed502f03SStefano Zampini #endif 45071a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 45089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 45099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4510ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4511ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4512ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4513ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4514ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4515ed502f03SStefano Zampini 45161a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 45171a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4518a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4519ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4520ed502f03SStefano Zampini CmatT->mat = CcsrT; 4521ed502f03SStefano Zampini CcsrT->num_rows = n; 4522ed502f03SStefano Zampini CcsrT->num_cols = m; 4523ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4524ed502f03SStefano Zampini 4525ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 4526ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4527ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4528ed502f03SStefano Zampini 45299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4530ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4531ed502f03SStefano Zampini if (AT) { 4532ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 4533ed502f03SStefano Zampini thrust::advance(rT,-1); 4534ed502f03SStefano Zampini } 4535ed502f03SStefano Zampini if (BT) { 4536ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 4537ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 4538ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 4539ed502f03SStefano Zampini } 4540ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4541ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 4542ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 4543ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4544ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4545ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 45469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4547ed502f03SStefano Zampini 45489566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 45499566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 45509566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 45519566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 45529566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 45539566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 45549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 45559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 45569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4557ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4558ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 4559ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 4560ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 45619566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 4562ed502f03SStefano Zampini #endif 4563ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4564ed502f03SStefano Zampini } 4565ed502f03SStefano Zampini } 4566ed502f03SStefano Zampini 4567ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4568ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4569ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 45709566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m+1,&c->i)); 45719566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->j)); 4572ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4573ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4574ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4575ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4576ed502f03SStefano Zampini jj = *Ccsr->column_indices; 45779566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 45789566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4579ed502f03SStefano Zampini } else { 45809566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 45819566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 4582ed502f03SStefano Zampini } 45839566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 45849566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->ilen)); 45859566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->imax)); 4586ed502f03SStefano Zampini c->maxnz = c->nz; 4587ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4588ed502f03SStefano Zampini c->rmax = 0; 4589ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4590ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 4591ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4592ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4593ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 4594ed502f03SStefano Zampini } 45959566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 45969566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->a)); 4597ed502f03SStefano Zampini (*C)->nonzerostate++; 45989566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 45999566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 4600ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4601ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4602ed502f03SStefano Zampini } else { 460308401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 4604ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 4605ed502f03SStefano Zampini if (c->nz) { 4606ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 46075f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 4608aed4548fSBarry Smith PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 460908401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 46109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 46119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 46125f80ce2aSJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 46135f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4614ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 4615ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 4616ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 4617aed4548fSBarry Smith PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 4618aed4548fSBarry Smith PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 4619aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 4620aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 46215f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 4622ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4623ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 46249566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4625ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 4626ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 4627ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 4628ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4629ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 4630ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 4631ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 4632ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 4633ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 4634ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 46359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 46361a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 46375f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4638ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4639ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 4640ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4641ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4642ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4643ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4644ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 46451a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4646ed502f03SStefano Zampini } 46479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4648ed502f03SStefano Zampini } 4649ed502f03SStefano Zampini } 46509566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4651ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4652ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4653ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4654ed502f03SStefano Zampini PetscFunctionReturn(0); 4655ed502f03SStefano Zampini } 4656c215019aSStefano Zampini 4657c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4658c215019aSStefano Zampini { 4659c215019aSStefano Zampini bool dmem; 4660c215019aSStefano Zampini const PetscScalar *av; 4661c215019aSStefano Zampini 4662c215019aSStefano Zampini PetscFunctionBegin; 4663c215019aSStefano Zampini dmem = isCudaMem(v); 46649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av)); 4665c215019aSStefano Zampini if (n && idx) { 4666c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4667c215019aSStefano Zampini widx.assign(idx,idx+n); 46689566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 4669c215019aSStefano Zampini 4670c215019aSStefano Zampini THRUSTARRAY *w = NULL; 4671c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 4672c215019aSStefano Zampini if (dmem) { 4673c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4674c215019aSStefano Zampini } else { 4675c215019aSStefano Zampini w = new THRUSTARRAY(n); 4676c215019aSStefano Zampini dv = w->data(); 4677c215019aSStefano Zampini } 4678c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4679c215019aSStefano Zampini 4680c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv)); 4681c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n)); 4682c215019aSStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 4683c215019aSStefano Zampini if (w) { 46849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost)); 4685c215019aSStefano Zampini } 4686c215019aSStefano Zampini delete w; 4687c215019aSStefano Zampini } else { 46889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4689c215019aSStefano Zampini } 46909566063dSJacob Faibussowitsch if (!dmem) PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 46919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av)); 4692c215019aSStefano Zampini PetscFunctionReturn(0); 4693c215019aSStefano Zampini } 4694