19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 18a2cee5feSJed Brown #include <thrust/remove.h> 19a2cee5feSJed Brown #include <thrust/sort.h> 20a2cee5feSJed Brown #include <thrust/unique.h> 21e8d2b73aSMark Adams 22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
26afb2bd1cSJunchao Zhang 27afb2bd1cSJunchao Zhang typedef enum { 28afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 29afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 30afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 31afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 32afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 33afb2bd1cSJunchao Zhang 34afb2bd1cSJunchao Zhang typedef enum { 35afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 42afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 43afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 47afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 48afb2bd1cSJunchao Zhang 49afb2bd1cSJunchao Zhang typedef enum { 50afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 53afb2bd1cSJunchao Zhang */ 54afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
56afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57afb2bd1cSJunchao Zhang #endif 589ae82921SPaul Mullowney 59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62087f3262SPaul Mullowney 636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 776fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 819ae82921SPaul Mullowney 827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 877f756511SDominic Meiser 8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 9057181aedSStefano Zampini 91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]); 93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 94c215019aSStefano Zampini 95bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 96ca45077fSPaul Mullowney { 97aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 986e111a19SKarl Rupp 99ca45077fSPaul Mullowney PetscFunctionBegin; 100ca45077fSPaul Mullowney switch 
(op) { 101e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 102aa372e3fSPaul Mullowney cusparsestruct->format = format; 103ca45077fSPaul Mullowney break; 104e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 105aa372e3fSPaul Mullowney cusparsestruct->format = format; 106ca45077fSPaul Mullowney break; 107ca45077fSPaul Mullowney default: 10898921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 109ca45077fSPaul Mullowney } 110ca45077fSPaul Mullowney PetscFunctionReturn(0); 111ca45077fSPaul Mullowney } 1129ae82921SPaul Mullowney 113e057df02SPaul Mullowney /*@ 114e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 115e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 116aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 117e057df02SPaul Mullowney Not Collective 118e057df02SPaul Mullowney 119e057df02SPaul Mullowney Input Parameters: 1208468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 12136d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1222692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 123e057df02SPaul Mullowney 124e057df02SPaul Mullowney Output Parameter: 125e057df02SPaul Mullowney 126e057df02SPaul Mullowney Level: intermediate 127e057df02SPaul Mullowney 128db781477SPatrick Sanan .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 129e057df02SPaul Mullowney @*/ 130e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 131e057df02SPaul Mullowney { 132e057df02SPaul Mullowney PetscFunctionBegin; 133e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 134cac4c232SBarry Smith PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)); 135e057df02SPaul Mullowney PetscFunctionReturn(0); 136e057df02SPaul Mullowney } 137e057df02SPaul Mullowney 138365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 139365b711fSMark Adams { 140365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 141365b711fSMark Adams 142365b711fSMark Adams PetscFunctionBegin; 143365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 144365b711fSMark Adams PetscFunctionReturn(0); 145365b711fSMark Adams } 146365b711fSMark Adams 147365b711fSMark Adams /*@ 148365b711fSMark Adams MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 149365b711fSMark Adams 150365b711fSMark Adams Input Parameters: 151365b711fSMark Adams + A - Matrix of type SEQAIJCUSPARSE 152365b711fSMark Adams - use_cpu - set flag for using the built-in CPU MatSolve 153365b711fSMark Adams 154365b711fSMark Adams Output Parameter: 155365b711fSMark Adams 156365b711fSMark Adams Notes: 157365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 158365b711fSMark Adams and moves the factors to the GPU for the solve. 
We have observed better performance keeping the data on the CPU and computing the solve there. 159365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 160365b711fSMark Adams 161365b711fSMark Adams Level: intermediate 162365b711fSMark Adams 163db781477SPatrick Sanan .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 164365b711fSMark Adams @*/ 165365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 166365b711fSMark Adams { 167365b711fSMark Adams PetscFunctionBegin; 168365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID,1); 169cac4c232SBarry Smith PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)); 170365b711fSMark Adams PetscFunctionReturn(0); 171365b711fSMark Adams } 172365b711fSMark Adams 1731a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 174e6e9a74fSStefano Zampini { 175e6e9a74fSStefano Zampini PetscFunctionBegin; 1761a2c6b5cSJunchao Zhang switch (op) { 1771a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 1781a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 1799566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1801a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 1811a2c6b5cSJunchao Zhang break; 1821a2c6b5cSJunchao Zhang default: 1839566063dSJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 1841a2c6b5cSJunchao Zhang break; 185e6e9a74fSStefano Zampini } 186e6e9a74fSStefano Zampini PetscFunctionReturn(0); 187e6e9a74fSStefano Zampini } 188e6e9a74fSStefano Zampini 189bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 190bddcd29dSMark Adams 191bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 
192bddcd29dSMark Adams { 193bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 194bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 195bddcd29dSMark Adams PetscBool row_identity,col_identity; 196365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 197bddcd29dSMark Adams 198bddcd29dSMark Adams PetscFunctionBegin; 1999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2009566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 201bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 202bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 2039566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 2049566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol,&col_identity)); 205f93f8571SJunchao Zhang 206365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 207f93f8571SJunchao Zhang if (row_identity && col_identity) { 208bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 209bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 210bddcd29dSMark Adams } else { 211bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 212bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 213365b711fSMark Adams } 214f93f8571SJunchao Zhang } 215bddcd29dSMark Adams B->ops->matsolve = NULL; 216bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 217bddcd29dSMark Adams 218bddcd29dSMark Adams /* get the triangular factors */ 219365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 2209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 221365b711fSMark Adams } 222bddcd29dSMark Adams PetscFunctionReturn(0); 223bddcd29dSMark Adams } 224bddcd29dSMark Adams 2254416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2269ae82921SPaul Mullowney { 227e057df02SPaul 
Mullowney MatCUSPARSEStorageFormat format; 2289ae82921SPaul Mullowney PetscBool flg; 229a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2306e111a19SKarl Rupp 2319ae82921SPaul Mullowney PetscFunctionBegin; 232d0609cedSBarry Smith PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options"); 2339ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 234d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 235d0609cedSBarry Smith "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 2369566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 237afb2bd1cSJunchao Zhang 238d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 239d0609cedSBarry Smith "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 2409566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 2419566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 2429566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 243afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 244d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 245d0609cedSBarry Smith "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg)); 246afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with 
cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 247ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301 248aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 249a435da06SStefano Zampini #else 250aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 251a435da06SStefano Zampini #endif 252d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 253d0609cedSBarry Smith "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg)); 254aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 255afb2bd1cSJunchao Zhang 256d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 257d0609cedSBarry Smith "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg)); 258aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 259afb2bd1cSJunchao Zhang #endif 2604c87dfd4SPaul Mullowney } 261d0609cedSBarry Smith PetscOptionsHeadEnd(); 2629ae82921SPaul Mullowney PetscFunctionReturn(0); 2639ae82921SPaul Mullowney } 2649ae82921SPaul Mullowney 265087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 2669ae82921SPaul 
Mullowney { 2679ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2689ae82921SPaul Mullowney PetscInt n = A->rmap->n; 2699ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 270aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 2719ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 2729ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 2739ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 2749ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 2759ae82921SPaul Mullowney 2769ae82921SPaul Mullowney PetscFunctionBegin; 277cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 278c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2799ae82921SPaul Mullowney try { 2809ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 2819ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 282da79fbbcSStefano Zampini if (!loTriFactor) { 2832cbc15d9SMark PetscScalar *AALo; 2842cbc15d9SMark 2859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar))); 2869ae82921SPaul Mullowney 2879ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 2889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt))); 2899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt))); 2909ae82921SPaul Mullowney 2919ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 2929ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 2939ae82921SPaul Mullowney AiLo[n] = nzLower; 2949ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 2959ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 2969ae82921SPaul Mullowney v = aa; 2979ae82921SPaul Mullowney vi = aj; 2989ae82921SPaul Mullowney offset = 1; 2999ae82921SPaul Mullowney rowOffset= 1; 3009ae82921SPaul Mullowney for (i=1; i<n; i++) { 3019ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 302e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3039ae82921SPaul Mullowney AiLo[i] = rowOffset; 3049ae82921SPaul Mullowney rowOffset += nz+1; 3059ae82921SPaul Mullowney 3069566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz)); 3079566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AALo[offset]), v, nz)); 3089ae82921SPaul Mullowney 3099ae82921SPaul Mullowney offset += nz; 3109ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 3119ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 3129ae82921SPaul Mullowney offset += 1; 3139ae82921SPaul Mullowney 3149ae82921SPaul Mullowney v += nz; 3159ae82921SPaul Mullowney vi += nz; 3169ae82921SPaul Mullowney } 3172205254eSKarl Rupp 318aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 3199566063dSJacob Faibussowitsch 
PetscCall(PetscNew(&loTriFactor)); 320da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 321aa372e3fSPaul Mullowney /* Create the matrix description */ 3229566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 3239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 3241b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 3259566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 326afb2bd1cSJunchao Zhang #else 3279566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 328afb2bd1cSJunchao Zhang #endif 3299566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 3309566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 331aa372e3fSPaul Mullowney 332aa372e3fSPaul Mullowney /* set the operation */ 333aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 334aa372e3fSPaul Mullowney 335aa372e3fSPaul Mullowney /* set the matrix */ 336aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 337aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 338aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 339aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 340aa372e3fSPaul Mullowney 341aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 342aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 343aa372e3fSPaul Mullowney 344aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 345aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 346aa372e3fSPaul Mullowney 347aa372e3fSPaul Mullowney 
loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 348aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 349aa372e3fSPaul Mullowney 350afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 3519566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 352261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 3531b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 354261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 355afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 356afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 357afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 3585f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 3599566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 360afb2bd1cSJunchao Zhang #endif 361afb2bd1cSJunchao Zhang 362aa372e3fSPaul Mullowney /* perform the solve analysis */ 363261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 364aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 365aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 366d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 3671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 368d49cd2b7SBarry Smith loTriFactor->solveInfo, 3695f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 370d49cd2b7SBarry Smith #else 3715f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 372afb2bd1cSJunchao Zhang #endif 
3739566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 3749566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 375aa372e3fSPaul Mullowney 376da79fbbcSStefano Zampini /* assign the pointer */ 377aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 3782cbc15d9SMark loTriFactor->AA_h = AALo; 3799566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiLo)); 3809566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjLo)); 3819566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar))); 382da79fbbcSStefano Zampini } else { /* update values only */ 3832cbc15d9SMark if (!loTriFactor->AA_h) { 3849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar))); 3852cbc15d9SMark } 386da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 3872cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 388da79fbbcSStefano Zampini v = aa; 389da79fbbcSStefano Zampini vi = aj; 390da79fbbcSStefano Zampini offset = 1; 391da79fbbcSStefano Zampini for (i=1; i<n; i++) { 392da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 3939566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 394da79fbbcSStefano Zampini offset += nz; 3952cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 396da79fbbcSStefano Zampini offset += 1; 397da79fbbcSStefano Zampini v += nz; 398da79fbbcSStefano Zampini } 3992cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 4009566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar))); 401da79fbbcSStefano Zampini } 4029ae82921SPaul Mullowney } catch(char *ex) { 40398921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4049ae82921SPaul Mullowney } 4059ae82921SPaul Mullowney } 4069ae82921SPaul Mullowney PetscFunctionReturn(0); 4079ae82921SPaul 
Mullowney } 4089ae82921SPaul Mullowney 409087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 4109ae82921SPaul Mullowney { 4119ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4129ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4139ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 414aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 4159ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 4169ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 4179ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 4189ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 4199ae82921SPaul Mullowney 4209ae82921SPaul Mullowney PetscFunctionBegin; 421cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 422c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 4239ae82921SPaul Mullowney try { 4249ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 4259ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 426da79fbbcSStefano Zampini if (!upTriFactor) { 4272cbc15d9SMark PetscScalar *AAUp; 4282cbc15d9SMark 4299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 4302cbc15d9SMark 4319ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 4329566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 4339566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 4349ae82921SPaul Mullowney 4359ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 4369ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 4379ae82921SPaul Mullowney AiUp[n]=nzUpper; 4389ae82921SPaul Mullowney offset = nzUpper; 4399ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 4409ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 4419ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 4429ae82921SPaul Mullowney 443e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 4449ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 4459ae82921SPaul Mullowney 446e057df02SPaul Mullowney /* decrement the offset */ 4479ae82921SPaul Mullowney offset -= (nz+1); 4489ae82921SPaul Mullowney 449e057df02SPaul Mullowney /* first, set the diagonal elements */ 4509ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 45109f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 4529ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 4539ae82921SPaul Mullowney 4549566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz)); 4559566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz)); 4569ae82921SPaul Mullowney } 4572205254eSKarl Rupp 458aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 4599566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 460da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 4612205254eSKarl Rupp 462aa372e3fSPaul Mullowney /* Create the matrix description */ 4639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 4649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 4651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 4669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 467afb2bd1cSJunchao Zhang #else 4689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 469afb2bd1cSJunchao Zhang #endif 4709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 4719566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 472aa372e3fSPaul Mullowney 473aa372e3fSPaul Mullowney /* set the operation */ 474aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 475aa372e3fSPaul Mullowney 476aa372e3fSPaul Mullowney /* set the matrix */ 477aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 478aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 479aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 480aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 481aa372e3fSPaul Mullowney 482aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 483aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 484aa372e3fSPaul Mullowney 485aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 486aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 487aa372e3fSPaul Mullowney 488aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
489aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 490aa372e3fSPaul Mullowney 491afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 4929566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 493261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 4941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 495261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 496afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 497afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 498afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 4995f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 5009566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 501afb2bd1cSJunchao Zhang #endif 502afb2bd1cSJunchao Zhang 503aa372e3fSPaul Mullowney /* perform the solve analysis */ 504261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 505aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 506aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 507d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 5081b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 509d49cd2b7SBarry Smith upTriFactor->solveInfo, 5105f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 511d49cd2b7SBarry Smith #else 5125f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 513afb2bd1cSJunchao Zhang #endif 5149566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 
5159566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 516aa372e3fSPaul Mullowney 517da79fbbcSStefano Zampini /* assign the pointer */ 518aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 5192cbc15d9SMark upTriFactor->AA_h = AAUp; 5209566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 5219566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 5229566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar))); 523da79fbbcSStefano Zampini } else { 5242cbc15d9SMark if (!upTriFactor->AA_h) { 5259566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar))); 5262cbc15d9SMark } 527da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 528da79fbbcSStefano Zampini offset = nzUpper; 529da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 530da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 531da79fbbcSStefano Zampini 532da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 533da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 534da79fbbcSStefano Zampini 535da79fbbcSStefano Zampini /* decrement the offset */ 536da79fbbcSStefano Zampini offset -= (nz+1); 537da79fbbcSStefano Zampini 538da79fbbcSStefano Zampini /* first, set the diagonal elements */ 5392cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 5409566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz)); 541da79fbbcSStefano Zampini } 5422cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 5439566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar))); 544da79fbbcSStefano Zampini } 5459ae82921SPaul Mullowney } catch(char *ex) { 54698921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5479ae82921SPaul Mullowney } 5489ae82921SPaul 
Mullowney } 5499ae82921SPaul Mullowney PetscFunctionReturn(0); 5509ae82921SPaul Mullowney } 5519ae82921SPaul Mullowney 552087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 5539ae82921SPaul Mullowney { 5549ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5559ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 5569ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 5579ae82921SPaul Mullowney PetscBool row_identity,col_identity; 5589ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5599ae82921SPaul Mullowney 5609ae82921SPaul Mullowney PetscFunctionBegin; 56128b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 5629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 5639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 5642205254eSKarl Rupp 565da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 566aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 5679ae82921SPaul Mullowney 568c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 569e057df02SPaul Mullowney /* lower triangular indices */ 5709566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 571da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 572da79fbbcSStefano Zampini const PetscInt *r; 573da79fbbcSStefano Zampini 5749566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&r)); 575aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 576aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 5779566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&r)); 5789566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 579da79fbbcSStefano Zampini } 5809ae82921SPaul 
Mullowney 581e057df02SPaul Mullowney /* upper triangular indices */ 5829566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol,&col_identity)); 583da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 584da79fbbcSStefano Zampini const PetscInt *c; 585da79fbbcSStefano Zampini 5869566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&c)); 587aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 588aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 5899566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&c)); 5909566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 591da79fbbcSStefano Zampini } 5929ae82921SPaul Mullowney PetscFunctionReturn(0); 5939ae82921SPaul Mullowney } 5949ae82921SPaul Mullowney 595087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 596087f3262SPaul Mullowney { 597087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 598087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 599aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 600aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 601087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 602087f3262SPaul Mullowney PetscScalar *AAUp; 603087f3262SPaul Mullowney PetscScalar *AALo; 604087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 605087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 606087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 607087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 608087f3262SPaul Mullowney 609087f3262SPaul Mullowney PetscFunctionBegin; 610cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 611c70f7ee4SJunchao 
Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 612087f3262SPaul Mullowney try { 6139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 6149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 615da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 616087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 6179566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 6189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 619087f3262SPaul Mullowney 620087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 621087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 622087f3262SPaul Mullowney AiUp[n]=nzUpper; 623087f3262SPaul Mullowney offset = 0; 624087f3262SPaul Mullowney for (i=0; i<n; i++) { 625087f3262SPaul Mullowney /* set the pointers */ 626087f3262SPaul Mullowney v = aa + ai[i]; 627087f3262SPaul Mullowney vj = aj + ai[i]; 628087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 629087f3262SPaul Mullowney 630087f3262SPaul Mullowney /* first, set the diagonal elements */ 631087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 63209f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 633087f3262SPaul Mullowney AiUp[i] = offset; 63409f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 635087f3262SPaul Mullowney 636087f3262SPaul Mullowney offset+=1; 637087f3262SPaul Mullowney if (nz>0) { 6389566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 6399566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 640087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 641087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 642087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 643087f3262SPaul Mullowney } 
644087f3262SPaul Mullowney offset+=nz; 645087f3262SPaul Mullowney } 646087f3262SPaul Mullowney } 647087f3262SPaul Mullowney 648aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 6499566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 650da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 651087f3262SPaul Mullowney 652aa372e3fSPaul Mullowney /* Create the matrix description */ 6539566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 6549566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 6551b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 6569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 657afb2bd1cSJunchao Zhang #else 6589566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 659afb2bd1cSJunchao Zhang #endif 6609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 6619566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 662087f3262SPaul Mullowney 663aa372e3fSPaul Mullowney /* set the matrix */ 664aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 665aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 666aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 667aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 668aa372e3fSPaul Mullowney 669aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 670aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 671aa372e3fSPaul Mullowney 672aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 
673aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 674aa372e3fSPaul Mullowney 675aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 676aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 677aa372e3fSPaul Mullowney 678afb2bd1cSJunchao Zhang /* set the operation */ 679afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 680afb2bd1cSJunchao Zhang 681afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 6829566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 683261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 6841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 685261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 686afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 687afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 688afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 6895f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 6909566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 691afb2bd1cSJunchao Zhang #endif 692afb2bd1cSJunchao Zhang 693aa372e3fSPaul Mullowney /* perform the solve analysis */ 694261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 695aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 696aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 697d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 6981b0a6780SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(9,0,0) 699d49cd2b7SBarry Smith upTriFactor->solveInfo, 7005f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 701d49cd2b7SBarry Smith #else 7025f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 703afb2bd1cSJunchao Zhang #endif 7049566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 7059566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 706aa372e3fSPaul Mullowney 707da79fbbcSStefano Zampini /* assign the pointer */ 708aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 709aa372e3fSPaul Mullowney 710aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 7119566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 712da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 713aa372e3fSPaul Mullowney 714aa372e3fSPaul Mullowney /* Create the matrix description */ 7159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 7169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 7171b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 7189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 719afb2bd1cSJunchao Zhang #else 7209566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 721afb2bd1cSJunchao Zhang #endif 7229566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 7239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 724aa372e3fSPaul Mullowney 725aa372e3fSPaul Mullowney /* set the operation */ 726aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 
727aa372e3fSPaul Mullowney 728aa372e3fSPaul Mullowney /* set the matrix */ 729aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 730aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 731aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 732aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 733aa372e3fSPaul Mullowney 734aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 735aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 736aa372e3fSPaul Mullowney 737aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 738aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 739aa372e3fSPaul Mullowney 740aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 741aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 742aa372e3fSPaul Mullowney 743afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 7449566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 745261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 7461b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 747261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 748afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 749afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 750afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 7515f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 7529566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 753afb2bd1cSJunchao Zhang #endif 
754afb2bd1cSJunchao Zhang 755aa372e3fSPaul Mullowney /* perform the solve analysis */ 756261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 757aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 758aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 759d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 7601b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 761d49cd2b7SBarry Smith loTriFactor->solveInfo, 7625f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 763d49cd2b7SBarry Smith #else 7645f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 765afb2bd1cSJunchao Zhang #endif 7669566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 7679566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 768aa372e3fSPaul Mullowney 769da79fbbcSStefano Zampini /* assign the pointer */ 770aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 771087f3262SPaul Mullowney 7729566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 7739566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 7749566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 775da79fbbcSStefano Zampini } else { 776da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 777da79fbbcSStefano Zampini offset = 0; 778da79fbbcSStefano Zampini for (i=0; i<n; i++) { 779da79fbbcSStefano Zampini /* set the pointers */ 780da79fbbcSStefano Zampini v = aa + ai[i]; 781da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 782da79fbbcSStefano Zampini 783da79fbbcSStefano Zampini /* first, set the diagonal elements */ 784da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 785da79fbbcSStefano 
Zampini AALo[offset] = 1.0/v[nz]; 786da79fbbcSStefano Zampini 787da79fbbcSStefano Zampini offset+=1; 788da79fbbcSStefano Zampini if (nz>0) { 7899566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 790da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 791da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 792da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 793da79fbbcSStefano Zampini } 794da79fbbcSStefano Zampini offset+=nz; 795da79fbbcSStefano Zampini } 796da79fbbcSStefano Zampini } 79728b400f6SJacob Faibussowitsch PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 79828b400f6SJacob Faibussowitsch PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 799da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 800da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 8019566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 802da79fbbcSStefano Zampini } 8039566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AAUp)); 8049566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AALo)); 805087f3262SPaul Mullowney } catch(char *ex) { 80698921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 807087f3262SPaul Mullowney } 808087f3262SPaul Mullowney } 809087f3262SPaul Mullowney PetscFunctionReturn(0); 810087f3262SPaul Mullowney } 811087f3262SPaul Mullowney 812087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 8139ae82921SPaul Mullowney { 814087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 815087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 816087f3262SPaul Mullowney IS ip = a->row; 817087f3262SPaul Mullowney PetscBool perm_identity; 818087f3262SPaul Mullowney PetscInt n = A->rmap->n; 819087f3262SPaul Mullowney 820087f3262SPaul Mullowney 
PetscFunctionBegin; 82128b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 8229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 823da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 824aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 825aa372e3fSPaul Mullowney 826da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 827da79fbbcSStefano Zampini 828087f3262SPaul Mullowney /* lower triangular indices */ 8299566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip,&perm_identity)); 830087f3262SPaul Mullowney if (!perm_identity) { 8314e4bbfaaSStefano Zampini IS iip; 832da79fbbcSStefano Zampini const PetscInt *irip,*rip; 8334e4bbfaaSStefano Zampini 8349566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 8359566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iip,&irip)); 8369566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ip,&rip)); 837aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 838aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 839aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 8404e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 8419566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iip,&irip)); 8429566063dSJacob Faibussowitsch PetscCall(ISDestroy(&iip)); 8439566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ip,&rip)); 8449566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 845da79fbbcSStefano Zampini } 846087f3262SPaul Mullowney PetscFunctionReturn(0); 847087f3262SPaul Mullowney } 848087f3262SPaul Mullowney 849087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 850087f3262SPaul Mullowney { 851087f3262SPaul 
Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 852087f3262SPaul Mullowney IS ip = b->row; 853087f3262SPaul Mullowney PetscBool perm_identity; 854087f3262SPaul Mullowney 855087f3262SPaul Mullowney PetscFunctionBegin; 8569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 8579566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 858ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 859087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 8609566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip,&perm_identity)); 861087f3262SPaul Mullowney if (perm_identity) { 862087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 863087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 8644e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8654e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 866087f3262SPaul Mullowney } else { 867087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 868087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 8694e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8704e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 871087f3262SPaul Mullowney } 872087f3262SPaul Mullowney 873087f3262SPaul Mullowney /* get the triangular factors */ 8749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 875087f3262SPaul Mullowney PetscFunctionReturn(0); 876087f3262SPaul Mullowney } 8779ae82921SPaul Mullowney 878b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 879bda325fcSPaul Mullowney { 880bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 881aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 882aa372e3fSPaul Mullowney 
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 883da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 884da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 885aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 886aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 887aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 888aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 889b175d8bbSPaul Mullowney 890bda325fcSPaul Mullowney PetscFunctionBegin; 891aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 8929566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 893da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 894aa372e3fSPaul Mullowney 895aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 896aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 897aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 898aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
899aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 900aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 901aa372e3fSPaul Mullowney 902aa372e3fSPaul Mullowney /* Create the matrix description */ 9039566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 9049566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 9059566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 9069566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 9079566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 908aa372e3fSPaul Mullowney 909aa372e3fSPaul Mullowney /* set the operation */ 910aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 911aa372e3fSPaul Mullowney 912aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 913aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 914afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 915afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 916aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 917afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 918afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 919afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 920aa372e3fSPaul Mullowney 921aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 922afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 9239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 924afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 925afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 926afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 927afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 928afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 929afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 930afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 9315f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 9329566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 933afb2bd1cSJunchao Zhang #endif 934afb2bd1cSJunchao Zhang 9359566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 9369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 937aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 938aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 939aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 940aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 941aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 942afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 943afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 944afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 9455f80ce2aSJacob Faibussowitsch 
CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 946afb2bd1cSJunchao Zhang #else 947afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 9485f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 949afb2bd1cSJunchao Zhang #endif 9509566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9519566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 952aa372e3fSPaul Mullowney 953afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 9549566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 955261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 9561b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 957261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 958afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 959afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 960afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 9615f80ce2aSJacob Faibussowitsch &loTriFactorT->solveBufferSize)); 9629566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 963afb2bd1cSJunchao Zhang #endif 964afb2bd1cSJunchao Zhang 965afb2bd1cSJunchao Zhang /* perform the solve analysis */ 966261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 967afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 968afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 969d49cd2b7SBarry Smith 
loTriFactorT->csrMat->column_indices->data().get(), 9701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 971d49cd2b7SBarry Smith loTriFactorT->solveInfo, 9725f80ce2aSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 973d49cd2b7SBarry Smith #else 9745f80ce2aSJacob Faibussowitsch loTriFactorT->solveInfo)); 975afb2bd1cSJunchao Zhang #endif 9769566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9779566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 978aa372e3fSPaul Mullowney 979da79fbbcSStefano Zampini /* assign the pointer */ 980aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 981aa372e3fSPaul Mullowney 982aa372e3fSPaul Mullowney /*********************************************/ 983aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 984aa372e3fSPaul Mullowney /*********************************************/ 985aa372e3fSPaul Mullowney 986aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 9879566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactorT)); 988da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 989aa372e3fSPaul Mullowney 990aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 991aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 992aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 993aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
994aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 995aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 996aa372e3fSPaul Mullowney 997aa372e3fSPaul Mullowney /* Create the matrix description */ 9989566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 9999566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 10009566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 10019566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 10029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1003aa372e3fSPaul Mullowney 1004aa372e3fSPaul Mullowney /* set the operation */ 1005aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1006aa372e3fSPaul Mullowney 1007aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1008aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1009afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1010afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1011aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1012afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1013afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1014afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1015aa372e3fSPaul Mullowney 1016aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1017afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 10189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1019afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1020afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1021afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1022afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1023afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1024afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1025afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 10265f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 10279566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1028afb2bd1cSJunchao Zhang #endif 1029afb2bd1cSJunchao Zhang 10309566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 10319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1032aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1033aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1034aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1035aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1036aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1037afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1039afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 
10405f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1041afb2bd1cSJunchao Zhang #else 1042afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 10435f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 1044afb2bd1cSJunchao Zhang #endif 1045d49cd2b7SBarry Smith 10469566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10479566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1048aa372e3fSPaul Mullowney 1049afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 10509566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1051261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 10521b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1053261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1054afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1055afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1056afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 10575f80ce2aSJacob Faibussowitsch &upTriFactorT->solveBufferSize)); 10589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1059afb2bd1cSJunchao Zhang #endif 1060afb2bd1cSJunchao Zhang 1061afb2bd1cSJunchao Zhang /* perform the solve analysis */ 10625f80ce2aSJacob Faibussowitsch /* christ, would it have killed you to put this stuff in a function????????? 
*/ 1063261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1064afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1065afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1066d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 10671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1068d49cd2b7SBarry Smith upTriFactorT->solveInfo, 10695f80ce2aSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1070d49cd2b7SBarry Smith #else 10715f80ce2aSJacob Faibussowitsch upTriFactorT->solveInfo)); 1072afb2bd1cSJunchao Zhang #endif 1073d49cd2b7SBarry Smith 10749566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10759566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1076aa372e3fSPaul Mullowney 1077da79fbbcSStefano Zampini /* assign the pointer */ 1078aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1079bda325fcSPaul Mullowney PetscFunctionReturn(0); 1080bda325fcSPaul Mullowney } 1081bda325fcSPaul Mullowney 1082a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1083a49f1ed0SStefano Zampini { 1084a49f1ed0SStefano Zampini __host__ __device__ 1085a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1086a49f1ed0SStefano Zampini { 1087a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1088a49f1ed0SStefano Zampini } 1089a49f1ed0SStefano Zampini }; 1090a49f1ed0SStefano Zampini 10913606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1092bda325fcSPaul Mullowney { 1093aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1094a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1095bda325fcSPaul Mullowney Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1096bda325fcSPaul Mullowney cusparseStatus_t stat; 1097aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1098b175d8bbSPaul Mullowney 1099bda325fcSPaul Mullowney PetscFunctionBegin; 11009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1101a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 110228b400f6SJacob Faibussowitsch PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1103a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 110408401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 11051a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 11069566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 11079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1108a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 11099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1110a49f1ed0SStefano Zampini } 1111a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1112aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 11139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1114aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 11159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 11169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1117aa372e3fSPaul Mullowney 1118b06137fdSPaul Mullowney /* set alpha and beta */ 11199566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 11209566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 11219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 11229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 11239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 11249566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1125b06137fdSPaul Mullowney 1126aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1127aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1128a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1129554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1130554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1131aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1132a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1133aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1134aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1135a3fdcf43SKarl Rupp 1136039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 113781902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1138afb2bd1cSJunchao Zhang 1139afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 11403606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1141afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1142afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1143afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1144afb2bd1cSJunchao Zhang 
matrixT->values->data().get(), 1145afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 11469566063dSJacob Faibussowitsch indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 11473606e59fSJunchao Zhang #else 11483606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 11493606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 11503606e59fSJunchao Zhang 11513606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 11523606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 11533606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 11543606e59fSJunchao Zhang */ 11553606e59fSJunchao Zhang if (matrixT->num_entries) { 11563606e59fSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 11573606e59fSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 11583606e59fSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 11593606e59fSJunchao Zhang matrixT->values->data().get(), 11603606e59fSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 11619566063dSJacob Faibussowitsch indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 11623606e59fSJunchao Zhang 11633606e59fSJunchao Zhang } else { 11643606e59fSJunchao Zhang matstructT->matDescr = NULL; 11653606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 11663606e59fSJunchao Zhang } 11673606e59fSJunchao Zhang #endif 1168afb2bd1cSJunchao Zhang #endif 1169aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1170afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 
1171afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1172afb2bd1cSJunchao Zhang #else 1173aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 117451c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 117551c6d536SStefano Zampini /* First convert HYB to CSR */ 1176aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1177aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1178aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1179aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1180aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1181aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1182aa372e3fSPaul Mullowney 1183aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1184aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1185aa372e3fSPaul Mullowney temp->values->data().get(), 1186aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 11879566063dSJacob Faibussowitsch temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1188aa372e3fSPaul Mullowney 1189aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1190aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1191aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1192aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1193aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1194aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1195aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1196aa372e3fSPaul Mullowney 1197aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1198aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1199aa372e3fSPaul Mullowney temp->values->data().get(), 1200aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1201aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1202aa372e3fSPaul Mullowney tempT->values->data().get(), 1203aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1204aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 12059566063dSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1206aa372e3fSPaul Mullowney 1207aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1208aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 12099566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1210aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1211aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1212aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1213aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1214aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1215aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 12169566063dSJacob Faibussowitsch hybMat, 0, partition);PetscCallCUSPARSE(stat); 1217aa372e3fSPaul Mullowney 1218aa372e3fSPaul Mullowney /* assign the pointer */ 1219aa372e3fSPaul Mullowney matstructT->mat = hybMat; 12201a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1221aa372e3fSPaul Mullowney /* delete temporaries */ 1222aa372e3fSPaul Mullowney if (tempT) { 1223aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1224aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1225aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1226aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1227087f3262SPaul Mullowney } 1228aa372e3fSPaul Mullowney if (temp) { 1229aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1230aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1231aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1232aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1233aa372e3fSPaul Mullowney } 1234afb2bd1cSJunchao Zhang #endif 1235aa372e3fSPaul Mullowney } 1236a49f1ed0SStefano Zampini } 1237a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1238a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1239a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 124028b400f6SJacob Faibussowitsch 
PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 124128b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 124228b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 124328b400f6SJacob Faibussowitsch PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 124428b400f6SJacob Faibussowitsch PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 124528b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 124628b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 124728b400f6SJacob Faibussowitsch PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1248a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1249a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1250a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 12519566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1252a49f1ed0SStefano Zampini } 1253a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1254a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1255a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1256a49f1ed0SStefano Zampini 1257a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1258a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1259a49f1ed0SStefano Zampini void *csr2cscBuffer; 1260a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1261a49f1ed0SStefano Zampini stat = 
cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1262a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1263a49f1ed0SStefano Zampini matrix->values->data().get(), 1264a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1265a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1266a49f1ed0SStefano Zampini matrixT->values->data().get(), 1267a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1268a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 12699566063dSJacob Faibussowitsch cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat); 12709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1271a49f1ed0SStefano Zampini #endif 1272a49f1ed0SStefano Zampini 12731a2c6b5cSJunchao Zhang if (matrix->num_entries) { 12741a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 12751a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 12761a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 12771a2c6b5cSJunchao Zhang 12781a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 12791a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
12801a2c6b5cSJunchao Zhang */ 12811a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 12821a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 12831a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 12841a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 12851a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1286a49f1ed0SStefano Zampini matrixT->values->data().get(), 1287a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1288a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1289a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 12909566063dSJacob Faibussowitsch cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat); 1291a49f1ed0SStefano Zampini #else 1292a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 12939566063dSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1294a49f1ed0SStefano Zampini #endif 12951a2c6b5cSJunchao Zhang } else { 12961a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 12971a2c6b5cSJunchao Zhang } 12981a2c6b5cSJunchao Zhang 1299a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1300a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1301a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 13029566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1303a49f1ed0SStefano Zampini #endif 1304a49f1ed0SStefano Zampini } 1305a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1306a49f1ed0SStefano Zampini 
thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1307a49f1ed0SStefano Zampini matrixT->values->begin())); 1308a49f1ed0SStefano Zampini } 13099566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 13109566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1311213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1312213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1313aa372e3fSPaul Mullowney /* assign the pointer */ 1314aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 13151a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1316bda325fcSPaul Mullowney PetscFunctionReturn(0); 1317bda325fcSPaul Mullowney } 1318bda325fcSPaul Mullowney 1319a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 13206fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1321bda325fcSPaul Mullowney { 1322c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1323465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1324465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1325465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1326465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1327bda325fcSPaul Mullowney cusparseStatus_t stat; 1328bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1329aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1330aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1331aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = 
(THRUSTARRAY*)cusparseTriFactors->workVector; 1332bda325fcSPaul Mullowney 1333bda325fcSPaul Mullowney PetscFunctionBegin; 1334aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1335aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 13369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1337aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1338aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1339bda325fcSPaul Mullowney } 1340bda325fcSPaul Mullowney 1341bda325fcSPaul Mullowney /* Get the GPU pointers */ 13429566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 13439566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1344c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1345c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1346bda325fcSPaul Mullowney 13479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1348aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1349a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1350c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1351c41cb2e2SAlejandro Lamas Daviña xGPU); 1352aa372e3fSPaul Mullowney 1353aa372e3fSPaul Mullowney /* First, solve U */ 1354261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1355afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 13561b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1357afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1358afb2bd1cSJunchao Zhang #endif 1359afb2bd1cSJunchao Zhang 
&PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1360aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1361aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1362aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1363aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1364d49cd2b7SBarry Smith xarray, 13651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1366d49cd2b7SBarry Smith tempGPU->data().get(), 13679566063dSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1368d49cd2b7SBarry Smith #else 13699566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1370afb2bd1cSJunchao Zhang #endif 1371aa372e3fSPaul Mullowney 1372aa372e3fSPaul Mullowney /* Then, solve L */ 1373261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1374afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 13751b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1376afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1377afb2bd1cSJunchao Zhang #endif 1378afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1379aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1380aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1381aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1382aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1383d49cd2b7SBarry Smith tempGPU->data().get(), 13841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1385d49cd2b7SBarry Smith xarray, 13869566063dSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1387d49cd2b7SBarry Smith #else 13889566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1389afb2bd1cSJunchao Zhang #endif 1390aa372e3fSPaul Mullowney 1391aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column 
permutation ... can't be done in place. */ 1392a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1393c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1394aa372e3fSPaul Mullowney tempGPU->begin()); 1395aa372e3fSPaul Mullowney 1396aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1397a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1398bda325fcSPaul Mullowney 1399bda325fcSPaul Mullowney /* restore */ 14009566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 14019566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 14029566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1404bda325fcSPaul Mullowney PetscFunctionReturn(0); 1405bda325fcSPaul Mullowney } 1406bda325fcSPaul Mullowney 14076fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1408bda325fcSPaul Mullowney { 1409465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1410465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1411bda325fcSPaul Mullowney cusparseStatus_t stat; 1412bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1413aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1414aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1415aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1416bda325fcSPaul 
Mullowney 1417bda325fcSPaul Mullowney PetscFunctionBegin; 1418aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1419aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 14209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1421aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1422aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1423bda325fcSPaul Mullowney } 1424bda325fcSPaul Mullowney 1425bda325fcSPaul Mullowney /* Get the GPU pointers */ 14269566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 14279566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1428bda325fcSPaul Mullowney 14299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1430aa372e3fSPaul Mullowney /* First, solve U */ 1431261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1432afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1434afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1435afb2bd1cSJunchao Zhang #endif 1436afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1437aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1438aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1439aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1440aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1441d49cd2b7SBarry Smith barray, 14421b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1443d49cd2b7SBarry Smith tempGPU->data().get(), 14449566063dSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1445d49cd2b7SBarry Smith #else 14469566063dSJacob Faibussowitsch 
tempGPU->data().get());PetscCallCUSPARSE(stat); 1447afb2bd1cSJunchao Zhang #endif 1448aa372e3fSPaul Mullowney 1449aa372e3fSPaul Mullowney /* Then, solve L */ 1450261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1451afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14521b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1453afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1454afb2bd1cSJunchao Zhang #endif 1455afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1456aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1457aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1458aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1459aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1460d49cd2b7SBarry Smith tempGPU->data().get(), 14611b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1462d49cd2b7SBarry Smith xarray, 14639566063dSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1464d49cd2b7SBarry Smith #else 14659566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1466afb2bd1cSJunchao Zhang #endif 1467bda325fcSPaul Mullowney 1468bda325fcSPaul Mullowney /* restore */ 14699566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 14709566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 14719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1473bda325fcSPaul Mullowney PetscFunctionReturn(0); 1474bda325fcSPaul Mullowney } 1475bda325fcSPaul Mullowney 14766fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 14779ae82921SPaul Mullowney { 1478465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1479465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 
1480465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1481465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 14829ae82921SPaul Mullowney cusparseStatus_t stat; 14839ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1484aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1485aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1486aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 14879ae82921SPaul Mullowney 14889ae82921SPaul Mullowney PetscFunctionBegin; 1489ebc8f436SDominic Meiser 1490e057df02SPaul Mullowney /* Get the GPU pointers */ 14919566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 14929566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1493c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1494c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 14959ae82921SPaul Mullowney 14969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1497aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1498a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1499c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 15004e4bbfaaSStefano Zampini tempGPU->begin()); 1501aa372e3fSPaul Mullowney 1502aa372e3fSPaul Mullowney /* Next, solve L */ 1503261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1504afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 15051b0a6780SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1506afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1507afb2bd1cSJunchao Zhang #endif 1508afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1509aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1510aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1511aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1512aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1513d49cd2b7SBarry Smith tempGPU->data().get(), 15141b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1515d49cd2b7SBarry Smith xarray, 15169566063dSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1517d49cd2b7SBarry Smith #else 15189566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1519afb2bd1cSJunchao Zhang #endif 1520aa372e3fSPaul Mullowney 1521aa372e3fSPaul Mullowney /* Then, solve U */ 1522261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1523afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 15241b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1525afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1526afb2bd1cSJunchao Zhang #endif 1527afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1528aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1529aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1530aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1531d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 15321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1533d49cd2b7SBarry Smith tempGPU->data().get(), 15349566063dSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1535d49cd2b7SBarry Smith #else 15369566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1537afb2bd1cSJunchao Zhang #endif 
1538d49cd2b7SBarry Smith 15394e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1540a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 15414e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 15424e4bbfaaSStefano Zampini xGPU); 15439ae82921SPaul Mullowney 15449566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 15459566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 15469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 15489ae82921SPaul Mullowney PetscFunctionReturn(0); 15499ae82921SPaul Mullowney } 15509ae82921SPaul Mullowney 15516fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 15529ae82921SPaul Mullowney { 1553465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1554465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 15559ae82921SPaul Mullowney cusparseStatus_t stat; 15569ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1557aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1558aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1559aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 15609ae82921SPaul Mullowney 15619ae82921SPaul Mullowney PetscFunctionBegin; 1562e057df02SPaul Mullowney /* Get the GPU pointers */ 15639566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 15649566063dSJacob Faibussowitsch 
PetscCall(VecCUDAGetArrayRead(bb,&barray)); 15659ae82921SPaul Mullowney 15669566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1567aa372e3fSPaul Mullowney /* First, solve L */ 1568261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1569afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 15701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1571afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1572afb2bd1cSJunchao Zhang #endif 1573afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1574aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1575aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1576aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1577aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1578d49cd2b7SBarry Smith barray, 15791b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1580d49cd2b7SBarry Smith tempGPU->data().get(), 15819566063dSJacob Faibussowitsch loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1582d49cd2b7SBarry Smith #else 15839566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1584afb2bd1cSJunchao Zhang #endif 1585d49cd2b7SBarry Smith 1586aa372e3fSPaul Mullowney /* Next, solve U */ 1587261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1588afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 15891b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1590afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1591afb2bd1cSJunchao Zhang #endif 1592afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1593aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1594aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1595aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1596aa372e3fSPaul Mullowney 
upTriFactor->solveInfo, 1597d49cd2b7SBarry Smith tempGPU->data().get(), 15981b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1599d49cd2b7SBarry Smith xarray, 16009566063dSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1601d49cd2b7SBarry Smith #else 16029566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1603afb2bd1cSJunchao Zhang #endif 16049ae82921SPaul Mullowney 16059566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 16069566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 16079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16089566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 16099ae82921SPaul Mullowney PetscFunctionReturn(0); 16109ae82921SPaul Mullowney } 16119ae82921SPaul Mullowney 1612da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 1613da112707SJunchao Zhang /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */ 1614da112707SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x) 1615da112707SJunchao Zhang { 1616da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1617da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1618da112707SJunchao Zhang const PetscScalar *barray; 1619da112707SJunchao Zhang PetscScalar *xarray; 1620da112707SJunchao Zhang 1621da112707SJunchao Zhang PetscFunctionBegin; 1622da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 1623da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b,&barray)); 1624da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1625da112707SJunchao Zhang 1626da112707SJunchao Zhang /* Solve L*y = b */ 1627da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 1628da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 
1629da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1630da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1631da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1632da112707SJunchao Zhang fs->spMatDescr_L, /* L Y = X */ 1633da112707SJunchao Zhang fs->dnVecDescr_X, 1634da112707SJunchao Zhang fs->dnVecDescr_Y, 1635da112707SJunchao Zhang cusparse_scalartype, 1636da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1637*12ba2bc6SJunchao Zhang fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()! 1638da112707SJunchao Zhang 1639da112707SJunchao Zhang /* Solve U*x = y */ 1640da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 1641da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1642da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1643da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1644da112707SJunchao Zhang fs->spMatDescr_U, /* U X = Y */ 1645da112707SJunchao Zhang fs->dnVecDescr_Y, 1646da112707SJunchao Zhang fs->dnVecDescr_X, 1647da112707SJunchao Zhang cusparse_scalartype, 1648da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1649da112707SJunchao Zhang fs->spsvDescr_U)); 1650da112707SJunchao Zhang 1651da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b,&barray)); 1652da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 1653da112707SJunchao Zhang 1654da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1655da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 1656da112707SJunchao Zhang PetscFunctionReturn(0); 1657da112707SJunchao Zhang } 1658da112707SJunchao Zhang 1659da112707SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x) 1660da112707SJunchao Zhang { 1661da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1662da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data;
1663da112707SJunchao Zhang const PetscScalar *barray; 1664da112707SJunchao Zhang PetscScalar *xarray; 1665da112707SJunchao Zhang 1666da112707SJunchao Zhang PetscFunctionBegin; 1667*12ba2bc6SJunchao Zhang if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */ 1668da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1669da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1670da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1671da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1672da112707SJunchao Zhang fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */ 1673da112707SJunchao Zhang fs->dnVecDescr_X, 1674da112707SJunchao Zhang fs->dnVecDescr_Y, 1675da112707SJunchao Zhang cusparse_scalartype, 1676da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1677da112707SJunchao Zhang fs->spsvDescr_Lt, 1678da112707SJunchao Zhang &fs->spsvBufferSize_Lt)); 1679da112707SJunchao Zhang 1680da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1681da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1682da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1683da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1684da112707SJunchao Zhang fs->spMatDescr_U, 1685da112707SJunchao Zhang fs->dnVecDescr_X, 1686da112707SJunchao Zhang fs->dnVecDescr_Y, 1687da112707SJunchao Zhang cusparse_scalartype, 1688da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1689da112707SJunchao Zhang fs->spsvDescr_Ut, 1690da112707SJunchao Zhang &fs->spsvBufferSize_Ut)); 1691da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt)); 1692*12ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Ut,fs->spsvBufferSize_Ut)); 1693*12ba2bc6SJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_TRUE; 1694*12ba2bc6SJunchao Zhang } 1695da112707SJunchao Zhang 1696*12ba2bc6SJunchao Zhang if
(!fs->updatedTransposeSpSVAnalysis) { 1697da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1698da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1699da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1700da112707SJunchao Zhang fs->spMatDescr_L, 1701da112707SJunchao Zhang fs->dnVecDescr_X, 1702da112707SJunchao Zhang fs->dnVecDescr_Y, 1703da112707SJunchao Zhang cusparse_scalartype, 1704da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1705da112707SJunchao Zhang fs->spsvDescr_Lt, 1706da112707SJunchao Zhang fs->spsvBuffer_Lt)); 1707da112707SJunchao Zhang 1708da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1709da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1710da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1711da112707SJunchao Zhang fs->spMatDescr_U, 1712da112707SJunchao Zhang fs->dnVecDescr_X, 1713da112707SJunchao Zhang fs->dnVecDescr_Y, 1714da112707SJunchao Zhang cusparse_scalartype, 1715da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1716da112707SJunchao Zhang fs->spsvDescr_Ut, 1717da112707SJunchao Zhang fs->spsvBuffer_Ut)); 1718*12ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1719da112707SJunchao Zhang } 1720da112707SJunchao Zhang 1721da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 1722da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b,&barray)); 1723da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1724da112707SJunchao Zhang 1725da112707SJunchao Zhang /* Solve Ut*y = b */ 1726da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 1727da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 1728da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1729da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1730da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1731da112707SJunchao Zhang fs->spMatDescr_U, /* Ut Y = X */ 1732da112707SJunchao Zhang fs->dnVecDescr_X, 
1733da112707SJunchao Zhang fs->dnVecDescr_Y, 1734da112707SJunchao Zhang cusparse_scalartype, 1735da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1736da112707SJunchao Zhang fs->spsvDescr_Ut)); 1737da112707SJunchao Zhang 1738da112707SJunchao Zhang /* Solve Lt*x = y */ 1739da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 1740da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1741da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1742da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1743da112707SJunchao Zhang fs->spMatDescr_L, /* Lt X = Y */ 1744da112707SJunchao Zhang fs->dnVecDescr_Y, 1745da112707SJunchao Zhang fs->dnVecDescr_X, 1746da112707SJunchao Zhang cusparse_scalartype, 1747da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1748da112707SJunchao Zhang fs->spsvDescr_Lt)); 1749da112707SJunchao Zhang 1750da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b,&barray)); 1751da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 1752da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1753da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 1754da112707SJunchao Zhang PetscFunctionReturn(0); 1755da112707SJunchao Zhang } 1756da112707SJunchao Zhang 1757da112707SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info) 1758da112707SJunchao Zhang { 1759da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1760da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1761da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1762da112707SJunchao Zhang CsrMatrix *Acsr; 1763da112707SJunchao Zhang PetscInt m,nz; 1764da112707SJunchao Zhang PetscBool flg; 1765da112707SJunchao Zhang 1766da112707SJunchao Zhang PetscFunctionBegin; 1767da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1768da112707SJunchao Zhang 
PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 1769da112707SJunchao Zhang PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 1770da112707SJunchao Zhang } 1771da112707SJunchao Zhang 1772da112707SJunchao Zhang /* Copy A's value to fact */ 1773da112707SJunchao Zhang m = fact->rmap->n; 1774da112707SJunchao Zhang nz = aij->nz; 1775da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1776da112707SJunchao Zhang Acsr = (CsrMatrix*)Acusp->mat->mat; 1777da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 1778da112707SJunchao Zhang 1779da112707SJunchao Zhang /* Factorize fact inplace */ 1780da112707SJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1781da112707SJunchao Zhang fs->matDescr_M, 1782da112707SJunchao Zhang fs->csrVal, 1783da112707SJunchao Zhang fs->csrRowPtr, 1784da112707SJunchao Zhang fs->csrColIdx, 1785da112707SJunchao Zhang fs->ilu0Info_M, 1786da112707SJunchao Zhang fs->policy_M, 1787da112707SJunchao Zhang fs->factBuffer_M)); 1788da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1789da112707SJunchao Zhang int numerical_zero; 1790da112707SJunchao Zhang cusparseStatus_t status; 1791da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1792da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero); 1793da112707SJunchao Zhang } 1794da112707SJunchao Zhang 1795*12ba2bc6SJunchao Zhang /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 1796*12ba2bc6SJunchao Zhang See discussion at 
https://github.com/NVIDIA/CUDALibrarySamples/issues/78 1797*12ba2bc6SJunchao Zhang */ 1798da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1799da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1800da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1801da112707SJunchao Zhang fs->spMatDescr_L, 1802da112707SJunchao Zhang fs->dnVecDescr_X, 1803da112707SJunchao Zhang fs->dnVecDescr_Y, 1804da112707SJunchao Zhang cusparse_scalartype, 1805da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1806da112707SJunchao Zhang fs->spsvDescr_L, 1807da112707SJunchao Zhang fs->spsvBuffer_L)); 1808da112707SJunchao Zhang 1809da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1810da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1811da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1812da112707SJunchao Zhang fs->spMatDescr_U, 1813da112707SJunchao Zhang fs->dnVecDescr_X, 1814da112707SJunchao Zhang fs->dnVecDescr_Y, 1815da112707SJunchao Zhang cusparse_scalartype, 1816da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1817da112707SJunchao Zhang fs->spsvDescr_U, 1818da112707SJunchao Zhang fs->spsvBuffer_U)); 1819da112707SJunchao Zhang 1820*12ba2bc6SJunchao Zhang /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 1821*12ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1822*12ba2bc6SJunchao Zhang 1823da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 1824da112707SJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ILU0; 1825da112707SJunchao Zhang fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_ILU0; 1826da112707SJunchao Zhang fact->ops->matsolve = NULL; 1827da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 1828da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1829da112707SJunchao Zhang PetscFunctionReturn(0); 1830da112707SJunchao Zhang } 1831da112707SJunchao Zhang 1832da112707SJunchao Zhang static 
PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1833da112707SJunchao Zhang { 1834da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1835da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1836da112707SJunchao Zhang PetscInt m,nz; 1837da112707SJunchao Zhang 1838da112707SJunchao Zhang PetscFunctionBegin; 1839da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1840da112707SJunchao Zhang PetscInt i; 1841da112707SJunchao Zhang PetscBool flg,missing; 1842da112707SJunchao Zhang 1843da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 1844da112707SJunchao Zhang PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 1845da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n); 1846da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A,&missing,&i)); 1847da112707SJunchao Zhang PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i); 1848da112707SJunchao Zhang } 1849da112707SJunchao Zhang 1850da112707SJunchao Zhang /* Free the old stale stuff */ 1851da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1852da112707SJunchao Zhang 1853da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1854da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 
1855da112707SJunchao Zhang */ 1856da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/)); 1857da112707SJunchao Zhang 1858da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 1859da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ILU; 1860da112707SJunchao Zhang fact->info.factor_mallocs = 0; 1861da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 1862da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 1863da112707SJunchao Zhang 1864da112707SJunchao Zhang aij->row = NULL; 1865da112707SJunchao Zhang aij->col = NULL; 1866da112707SJunchao Zhang 1867da112707SJunchao Zhang /* ====================================================================== */ 1868da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. */ 1869da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 1870da112707SJunchao Zhang /* ====================================================================== */ 1871da112707SJunchao Zhang const int *Ai,*Aj; 1872da112707SJunchao Zhang 1873da112707SJunchao Zhang m = fact->rmap->n; 1874da112707SJunchao Zhang nz = aij->nz; 1875da112707SJunchao Zhang 1876da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1))); 1877da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz)); 1878da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz)); 1879da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */ 1880da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 1881da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 1882da112707SJunchao Zhang 1883da112707SJunchao Zhang /* 
====================================================================== */ 1884da112707SJunchao Zhang /* Create descriptors for M, L, U */ 1885da112707SJunchao Zhang /* ====================================================================== */ 1886da112707SJunchao Zhang cusparseFillMode_t fillMode; 1887da112707SJunchao Zhang cusparseDiagType_t diagType; 1888da112707SJunchao Zhang 1889da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1890da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1891da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1892da112707SJunchao Zhang 1893da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1894da112707SJunchao Zhang cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1895da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1896da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1897da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1898da112707SJunchao Zhang */ 1899da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 1900da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_UNIT; 1901da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz, 1902da112707SJunchao Zhang fs->csrRowPtr, 1903da112707SJunchao Zhang fs->csrColIdx, 1904da112707SJunchao Zhang fs->csrVal, 1905da112707SJunchao Zhang CUSPARSE_INDEX_32I, 1906da112707SJunchao Zhang CUSPARSE_INDEX_32I, 1907da112707SJunchao Zhang CUSPARSE_INDEX_BASE_ZERO, 1908da112707SJunchao Zhang cusparse_scalartype)); 1909da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 1910da112707SJunchao Zhang CUSPARSE_SPMAT_FILL_MODE, 1911da112707SJunchao Zhang &fillMode, 1912da112707SJunchao Zhang sizeof(fillMode))); 1913da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 1914da112707SJunchao Zhang CUSPARSE_SPMAT_DIAG_TYPE, 1915da112707SJunchao Zhang &diagType, 1916da112707SJunchao Zhang sizeof(diagType))); 1917da112707SJunchao Zhang 1918da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_UPPER; 1919da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1920da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz, 1921da112707SJunchao Zhang fs->csrRowPtr, 1922da112707SJunchao Zhang fs->csrColIdx, 1923da112707SJunchao Zhang fs->csrVal, 1924da112707SJunchao Zhang CUSPARSE_INDEX_32I, 1925da112707SJunchao Zhang CUSPARSE_INDEX_32I, 1926da112707SJunchao Zhang CUSPARSE_INDEX_BASE_ZERO, 1927da112707SJunchao Zhang cusparse_scalartype)); 1928da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, 1929da112707SJunchao Zhang CUSPARSE_SPMAT_FILL_MODE, 1930da112707SJunchao Zhang &fillMode, 1931da112707SJunchao Zhang sizeof(fillMode))); 1932da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, 1933da112707SJunchao Zhang CUSPARSE_SPMAT_DIAG_TYPE, 1934da112707SJunchao Zhang &diagType, 
1935da112707SJunchao Zhang sizeof(diagType))); 1936da112707SJunchao Zhang 1937da112707SJunchao Zhang /* ========================================================================= */ 1938da112707SJunchao Zhang /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1939da112707SJunchao Zhang /* ========================================================================= */ 1940da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1941da112707SJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1942da112707SJunchao Zhang fs->matDescr_M, 1943da112707SJunchao Zhang fs->csrVal, 1944da112707SJunchao Zhang fs->csrRowPtr, 1945da112707SJunchao Zhang fs->csrColIdx, 1946da112707SJunchao Zhang fs->ilu0Info_M, 1947da112707SJunchao Zhang &fs->factBufferSize_M)); 1948da112707SJunchao Zhang 1949da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m)); 1950da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m)); 1951da112707SJunchao Zhang 1952da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype)); 1953da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype)); 1954da112707SJunchao Zhang 1955da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1956da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1957da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1958da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1959da112707SJunchao Zhang fs->spMatDescr_L, 1960da112707SJunchao Zhang fs->dnVecDescr_X, 1961da112707SJunchao Zhang fs->dnVecDescr_Y, 1962da112707SJunchao Zhang cusparse_scalartype, 1963da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1964da112707SJunchao Zhang fs->spsvDescr_L, 1965da112707SJunchao Zhang &fs->spsvBufferSize_L)); 
1966da112707SJunchao Zhang 1967da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1968da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1969da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1970da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1971da112707SJunchao Zhang fs->spMatDescr_U, 1972da112707SJunchao Zhang fs->dnVecDescr_X, 1973da112707SJunchao Zhang fs->dnVecDescr_Y, 1974da112707SJunchao Zhang cusparse_scalartype, 1975da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1976da112707SJunchao Zhang fs->spsvDescr_U, 1977da112707SJunchao Zhang &fs->spsvBufferSize_U)); 1978da112707SJunchao Zhang 1979da112707SJunchao Zhang /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1980*12ba2bc6SJunchao Zhang and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1981*12ba2bc6SJunchao Zhang spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1982*12ba2bc6SJunchao Zhang To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1983da112707SJunchao Zhang */ 1984*12ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1985*12ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M))); 1986*12ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 1987da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U)); 1988*12ba2bc6SJunchao Zhang } else { 1989*12ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_U,(size_t)fs->factBufferSize_M))); 1990*12ba2bc6SJunchao Zhang fs->spsvBuffer_U = fs->factBuffer_M; 1991da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L)); 1992*12ba2bc6SJunchao Zhang } 1993da112707SJunchao Zhang 1994da112707SJunchao Zhang /* ========================================================================== */ 1995da112707SJunchao Zhang /* Perform analysis of ilu0 on M, SpSv on L and U */ 1996da112707SJunchao Zhang /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1997da112707SJunchao Zhang /* ========================================================================== */ 1998da112707SJunchao Zhang int structural_zero; 1999da112707SJunchao Zhang cusparseStatus_t status; 2000da112707SJunchao Zhang 2001da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2002da112707SJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 2003da112707SJunchao Zhang fs->matDescr_M, 2004da112707SJunchao Zhang fs->csrVal, 2005da112707SJunchao Zhang fs->csrRowPtr, 2006da112707SJunchao Zhang fs->csrColIdx, 2007da112707SJunchao Zhang fs->ilu0Info_M, 2008da112707SJunchao Zhang fs->policy_M, 2009da112707SJunchao Zhang fs->factBuffer_M)); 2010da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2011da112707SJunchao Zhang /* Function 
cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 2012da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 2013da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero); 2014da112707SJunchao Zhang } 2015da112707SJunchao Zhang 2016da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 20170dd8c0acSJunchao Zhang { 2018da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ*)A->data; 20190dd8c0acSJunchao Zhang PetscInt *Ai,*Adiag,nzRow,nzLeft; 2020da112707SJunchao Zhang PetscLogDouble flops = 0.0; 2021da112707SJunchao Zhang 2022da112707SJunchao Zhang PetscCall(MatMarkDiagonal_SeqAIJ(A)); 2023da112707SJunchao Zhang Ai = Aseq->i; 2024da112707SJunchao Zhang Adiag = Aseq->diag; 2025da112707SJunchao Zhang for (PetscInt i=0; i<m; i++) { 2026da112707SJunchao Zhang if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */ 2027da112707SJunchao Zhang nzRow = Ai[i+1] - Ai[i]; 2028da112707SJunchao Zhang nzLeft = Adiag[i] - Ai[i]; 2029da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2030da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 
2031da112707SJunchao Zhang */ 2032da112707SJunchao Zhang nzLeft = (nzRow-1)/2; 2033da112707SJunchao Zhang flops += nzLeft*(2.0*nzRow-nzLeft+1); 2034da112707SJunchao Zhang } 2035da112707SJunchao Zhang } 2036da112707SJunchao Zhang fs->numericFactFlops = flops; 20370dd8c0acSJunchao Zhang } 2038da112707SJunchao Zhang fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 2039da112707SJunchao Zhang PetscFunctionReturn(0); 2040da112707SJunchao Zhang } 2041da112707SJunchao Zhang 2042da112707SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x) 2043da112707SJunchao Zhang { 2044da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 2045da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 2046da112707SJunchao Zhang const PetscScalar *barray; 2047da112707SJunchao Zhang PetscScalar *xarray; 2048da112707SJunchao Zhang 2049da112707SJunchao Zhang PetscFunctionBegin; 2050da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 2051da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b,&barray)); 2052da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 2053da112707SJunchao Zhang 2054da112707SJunchao Zhang /* Solve L*y = b */ 2055da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 2056da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 2057da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 2058da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 2059da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2060da112707SJunchao Zhang fs->spMatDescr_L, /* L Y = X */ 2061da112707SJunchao Zhang fs->dnVecDescr_X, 2062da112707SJunchao Zhang fs->dnVecDescr_Y, 2063da112707SJunchao Zhang cusparse_scalartype, 2064da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2065da112707SJunchao Zhang fs->spsvDescr_L)); 2066da112707SJunchao Zhang 2067da112707SJunchao Zhang /* Solve Lt*x = y 
*/ 2068da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 2069da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 2070da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 2071da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2072da112707SJunchao Zhang fs->spMatDescr_L, /* Lt X = Y */ 2073da112707SJunchao Zhang fs->dnVecDescr_Y, 2074da112707SJunchao Zhang fs->dnVecDescr_X, 2075da112707SJunchao Zhang cusparse_scalartype, 2076da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2077da112707SJunchao Zhang fs->spsvDescr_Lt)); 2078da112707SJunchao Zhang 2079da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b,&barray)); 2080da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 2081da112707SJunchao Zhang 2082da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 2083da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 2084da112707SJunchao Zhang PetscFunctionReturn(0); 2085da112707SJunchao Zhang } 2086da112707SJunchao Zhang 2087da112707SJunchao Zhang static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info) 2088da112707SJunchao Zhang { 2089da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 2090da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 2091da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2092da112707SJunchao Zhang CsrMatrix *Acsr; 2093da112707SJunchao Zhang PetscInt m,nz; 2094da112707SJunchao Zhang PetscBool flg; 2095da112707SJunchao Zhang 2096da112707SJunchao Zhang PetscFunctionBegin; 2097da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2098da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2099da112707SJunchao Zhang PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 
2100da112707SJunchao Zhang } 2101da112707SJunchao Zhang 2102da112707SJunchao Zhang /* Copy A's value to fact */ 2103da112707SJunchao Zhang m = fact->rmap->n; 2104da112707SJunchao Zhang nz = aij->nz; 2105da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2106da112707SJunchao Zhang Acsr = (CsrMatrix*)Acusp->mat->mat; 2107da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2108da112707SJunchao Zhang 2109da112707SJunchao Zhang /* Factorize fact inplace */ 2110da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 2111da112707SJunchao Zhang Function csric02() only takes the lower triangular part of matrix A to perform factorization. 2112da112707SJunchao Zhang The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 2113da112707SJunchao Zhang and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 2114da112707SJunchao Zhang In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 
2115da112707SJunchao Zhang */ 2116da112707SJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, 2117da112707SJunchao Zhang fs->matDescr_M, 2118da112707SJunchao Zhang fs->csrVal, 2119da112707SJunchao Zhang fs->csrRowPtr, 2120da112707SJunchao Zhang fs->csrColIdx, 2121da112707SJunchao Zhang fs->ic0Info_M, 2122da112707SJunchao Zhang fs->policy_M, 2123da112707SJunchao Zhang fs->factBuffer_M)); 2124da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2125da112707SJunchao Zhang int numerical_zero; 2126da112707SJunchao Zhang cusparseStatus_t status; 2127da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 2128da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero); 2129da112707SJunchao Zhang } 2130da112707SJunchao Zhang 2131da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 2132da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 2133da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2134da112707SJunchao Zhang fs->spMatDescr_L, 2135da112707SJunchao Zhang fs->dnVecDescr_X, 2136da112707SJunchao Zhang fs->dnVecDescr_Y, 2137da112707SJunchao Zhang cusparse_scalartype, 2138da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2139da112707SJunchao Zhang fs->spsvDescr_L, 2140da112707SJunchao Zhang fs->spsvBuffer_L)); 2141da112707SJunchao Zhang 2142da112707SJunchao Zhang /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 2143da112707SJunchao Zhang ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 2144da112707SJunchao Zhang */ 2145da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 2146da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 2147da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2148da112707SJunchao 
Zhang fs->spMatDescr_L, 2149da112707SJunchao Zhang fs->dnVecDescr_X, 2150da112707SJunchao Zhang fs->dnVecDescr_Y, 2151da112707SJunchao Zhang cusparse_scalartype, 2152da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2153da112707SJunchao Zhang fs->spsvDescr_Lt, 2154da112707SJunchao Zhang fs->spsvBuffer_Lt)); 2155da112707SJunchao Zhang 2156da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 2157da112707SJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 2158da112707SJunchao Zhang fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 2159da112707SJunchao Zhang fact->ops->matsolve = NULL; 2160da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 2161da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 2162da112707SJunchao Zhang PetscFunctionReturn(0); 2163da112707SJunchao Zhang } 2164da112707SJunchao Zhang 2165da112707SJunchao Zhang static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2166da112707SJunchao Zhang { 2167da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 2168da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 2169da112707SJunchao Zhang PetscInt m,nz; 2170da112707SJunchao Zhang 2171da112707SJunchao Zhang PetscFunctionBegin; 2172da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2173da112707SJunchao Zhang PetscInt i; 2174da112707SJunchao Zhang PetscBool flg,missing; 2175da112707SJunchao Zhang 2176da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2177da112707SJunchao Zhang PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 2178da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n); 2179da112707SJunchao Zhang 
PetscCall(MatMissingDiagonal(A,&missing,&i)); 2180da112707SJunchao Zhang PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i); 2181da112707SJunchao Zhang } 2182da112707SJunchao Zhang 2183da112707SJunchao Zhang /* Free the old stale stuff */ 2184da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2185da112707SJunchao Zhang 2186da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2187da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 2188da112707SJunchao Zhang */ 2189da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/)); 2190da112707SJunchao Zhang 2191da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 2192da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ICC; 2193da112707SJunchao Zhang fact->info.factor_mallocs = 0; 2194da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 2195da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 2196da112707SJunchao Zhang 2197da112707SJunchao Zhang aij->row = NULL; 2198da112707SJunchao Zhang aij->col = NULL; 2199da112707SJunchao Zhang 2200da112707SJunchao Zhang /* ====================================================================== */ 2201da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 2202da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 2203da112707SJunchao Zhang /* ====================================================================== */ 2204da112707SJunchao Zhang const int *Ai,*Aj; 2205da112707SJunchao Zhang 2206da112707SJunchao Zhang m = fact->rmap->n; 2207da112707SJunchao Zhang nz = aij->nz; 2208da112707SJunchao Zhang 2209da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1))); 2210da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz)); 2211da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz)); 2212da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */ 2213da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2214da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2215da112707SJunchao Zhang 2216da112707SJunchao Zhang /* ====================================================================== */ 2217da112707SJunchao Zhang /* Create mat descriptors for M, L */ 2218da112707SJunchao Zhang /* ====================================================================== */ 2219da112707SJunchao Zhang cusparseFillMode_t fillMode; 2220da112707SJunchao Zhang cusparseDiagType_t diagType; 2221da112707SJunchao Zhang 2222da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2223da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2224da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2225da112707SJunchao Zhang 2226da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2227da112707SJunchao Zhang cusparseDiagType_t: This type indicates if 
the matrix diagonal entries are unity. The diagonal elements are always 2228da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2229da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2230da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 2231da112707SJunchao Zhang */ 2232da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 2233da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2234da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz, 2235da112707SJunchao Zhang fs->csrRowPtr, 2236da112707SJunchao Zhang fs->csrColIdx, 2237da112707SJunchao Zhang fs->csrVal, 2238da112707SJunchao Zhang CUSPARSE_INDEX_32I, 2239da112707SJunchao Zhang CUSPARSE_INDEX_32I, 2240da112707SJunchao Zhang CUSPARSE_INDEX_BASE_ZERO, 2241da112707SJunchao Zhang cusparse_scalartype)); 2242da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 2243da112707SJunchao Zhang CUSPARSE_SPMAT_FILL_MODE, 2244da112707SJunchao Zhang &fillMode, 2245da112707SJunchao Zhang sizeof(fillMode))); 2246da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 2247da112707SJunchao Zhang CUSPARSE_SPMAT_DIAG_TYPE, 2248da112707SJunchao Zhang &diagType, 2249da112707SJunchao Zhang sizeof(diagType))); 2250da112707SJunchao Zhang 2251da112707SJunchao Zhang /* ========================================================================= */ 2252da112707SJunchao Zhang /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2253da112707SJunchao Zhang /* ========================================================================= */ 2254da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2255da112707SJunchao Zhang if (m) 
PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, 2256da112707SJunchao Zhang fs->matDescr_M, 2257da112707SJunchao Zhang fs->csrVal, 2258da112707SJunchao Zhang fs->csrRowPtr, 2259da112707SJunchao Zhang fs->csrColIdx, 2260da112707SJunchao Zhang fs->ic0Info_M, 2261da112707SJunchao Zhang &fs->factBufferSize_M)); 2262da112707SJunchao Zhang 2263da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m)); 2264da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m)); 2265da112707SJunchao Zhang 2266da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype)); 2267da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype)); 2268da112707SJunchao Zhang 2269da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 2270da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 2271da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 2272da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2273da112707SJunchao Zhang fs->spMatDescr_L, 2274da112707SJunchao Zhang fs->dnVecDescr_X, 2275da112707SJunchao Zhang fs->dnVecDescr_Y, 2276da112707SJunchao Zhang cusparse_scalartype, 2277da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2278da112707SJunchao Zhang fs->spsvDescr_L, 2279da112707SJunchao Zhang &fs->spsvBufferSize_L)); 2280da112707SJunchao Zhang 2281da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 2282da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 2283da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 2284da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2285da112707SJunchao Zhang fs->spMatDescr_L, 2286da112707SJunchao Zhang fs->dnVecDescr_X, 2287da112707SJunchao Zhang fs->dnVecDescr_Y, 2288da112707SJunchao Zhang cusparse_scalartype, 2289da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2290da112707SJunchao Zhang 
fs->spsvDescr_Lt, 2291da112707SJunchao Zhang &fs->spsvBufferSize_Lt)); 2292da112707SJunchao Zhang 2293*12ba2bc6SJunchao Zhang /* To save device memory, we make the factorization buffer share with one of the solver buffer. 2294*12ba2bc6SJunchao Zhang See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 2295*12ba2bc6SJunchao Zhang */ 2296*12ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 2297*12ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M))); 2298*12ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 2299da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt)); 2300*12ba2bc6SJunchao Zhang } else { 2301*12ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_Lt,(size_t)fs->factBufferSize_M))); 2302*12ba2bc6SJunchao Zhang fs->spsvBuffer_Lt = fs->factBuffer_M; 2303*12ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L)); 2304*12ba2bc6SJunchao Zhang } 2305da112707SJunchao Zhang 2306da112707SJunchao Zhang /* ========================================================================== */ 2307da112707SJunchao Zhang /* Perform analysis of ic0 on M */ 2308da112707SJunchao Zhang /* The lower triangular part of M has the same sparsity pattern as L */ 2309da112707SJunchao Zhang /* ========================================================================== */ 2310da112707SJunchao Zhang int structural_zero; 2311da112707SJunchao Zhang cusparseStatus_t status; 2312da112707SJunchao Zhang 2313da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2314da112707SJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, 2315da112707SJunchao Zhang fs->matDescr_M, 2316da112707SJunchao Zhang fs->csrVal, 2317da112707SJunchao Zhang fs->csrRowPtr, 2318da112707SJunchao Zhang fs->csrColIdx, 
2319da112707SJunchao Zhang fs->ic0Info_M, 2320da112707SJunchao Zhang fs->policy_M, 2321da112707SJunchao Zhang fs->factBuffer_M)); 2322da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2323da112707SJunchao Zhang /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 2324da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2325da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero); 2326da112707SJunchao Zhang } 2327da112707SJunchao Zhang 2328da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 23290dd8c0acSJunchao Zhang { 2330da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ*)A->data; 23310dd8c0acSJunchao Zhang PetscInt *Ai,nzRow,nzLeft; 2332da112707SJunchao Zhang PetscLogDouble flops = 0.0; 2333da112707SJunchao Zhang 2334da112707SJunchao Zhang Ai = Aseq->i; 2335da112707SJunchao Zhang for (PetscInt i=0; i<m; i++) { 2336da112707SJunchao Zhang nzRow = Ai[i+1] - Ai[i]; 2337da112707SJunchao Zhang if (nzRow > 1) { 2338da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2339da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 
2340da112707SJunchao Zhang */ 2341da112707SJunchao Zhang nzLeft = (nzRow-1)/2; 2342da112707SJunchao Zhang flops += nzLeft*(2.0*nzRow-nzLeft+1); 2343da112707SJunchao Zhang } 2344da112707SJunchao Zhang } 2345da112707SJunchao Zhang fs->numericFactFlops = flops; 23460dd8c0acSJunchao Zhang } 2347da112707SJunchao Zhang fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2348da112707SJunchao Zhang PetscFunctionReturn(0); 2349da112707SJunchao Zhang } 2350da112707SJunchao Zhang #endif 2351da112707SJunchao Zhang 2352da112707SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2353da112707SJunchao Zhang { 2354da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2355da112707SJunchao Zhang 2356da112707SJunchao Zhang PetscFunctionBegin; 2357da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 2358da112707SJunchao Zhang PetscBool row_identity,col_identity; 2359da112707SJunchao Zhang PetscCall(ISIdentity(isrow,&row_identity)); 2360da112707SJunchao Zhang PetscCall(ISIdentity(iscol,&col_identity)); 2361da112707SJunchao Zhang if (!info->levels && row_identity && col_identity) { 2362da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info)); 2363da112707SJunchao Zhang } else 2364da112707SJunchao Zhang #endif 2365da112707SJunchao Zhang { 2366da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2367da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2368da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2369da112707SJunchao Zhang } 2370da112707SJunchao Zhang PetscFunctionReturn(0); 2371da112707SJunchao Zhang } 2372da112707SJunchao Zhang 2373da112707SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 
2374da112707SJunchao Zhang { 2375da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2376da112707SJunchao Zhang 2377da112707SJunchao Zhang PetscFunctionBegin; 2378da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2379da112707SJunchao Zhang PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2380da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2381da112707SJunchao Zhang PetscFunctionReturn(0); 2382da112707SJunchao Zhang } 2383da112707SJunchao Zhang 2384da112707SJunchao Zhang static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2385da112707SJunchao Zhang { 2386da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2387da112707SJunchao Zhang 2388da112707SJunchao Zhang PetscFunctionBegin; 2389da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 2390da112707SJunchao Zhang PetscBool perm_identity; 2391da112707SJunchao Zhang PetscCall(ISIdentity(perm,&perm_identity)); 2392da112707SJunchao Zhang if (!info->levels && perm_identity) { 2393da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info)); 2394da112707SJunchao Zhang } else 2395da112707SJunchao Zhang #endif 2396da112707SJunchao Zhang { 2397da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2398da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info)); 2399da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2400da112707SJunchao Zhang } 2401da112707SJunchao Zhang PetscFunctionReturn(0); 2402da112707SJunchao Zhang } 2403da112707SJunchao Zhang 2404da112707SJunchao Zhang static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2405da112707SJunchao Zhang { 2406da112707SJunchao Zhang 
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2407da112707SJunchao Zhang 2408da112707SJunchao Zhang PetscFunctionBegin; 2409da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2410da112707SJunchao Zhang PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info)); 2411da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2412da112707SJunchao Zhang PetscFunctionReturn(0); 2413da112707SJunchao Zhang } 2414da112707SJunchao Zhang 2415841d4cb1SJunchao Zhang PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 2416841d4cb1SJunchao Zhang { 2417841d4cb1SJunchao Zhang PetscFunctionBegin; 2418841d4cb1SJunchao Zhang *type = MATSOLVERCUSPARSE; 2419841d4cb1SJunchao Zhang PetscFunctionReturn(0); 2420841d4cb1SJunchao Zhang } 2421841d4cb1SJunchao Zhang 2422841d4cb1SJunchao Zhang /*MC 2423841d4cb1SJunchao Zhang MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 2424841d4cb1SJunchao Zhang on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 2425841d4cb1SJunchao Zhang algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2426841d4cb1SJunchao Zhang performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 2427841d4cb1SJunchao Zhang CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2428841d4cb1SJunchao Zhang algorithms are not recommended. This class does NOT support direct solver operations. 
2429841d4cb1SJunchao Zhang 2430841d4cb1SJunchao Zhang Level: beginner 2431841d4cb1SJunchao Zhang 2432841d4cb1SJunchao Zhang .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2433841d4cb1SJunchao Zhang M*/ 2434841d4cb1SJunchao Zhang 2435841d4cb1SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 2436841d4cb1SJunchao Zhang { 2437841d4cb1SJunchao Zhang PetscInt n = A->rmap->n; 2438841d4cb1SJunchao Zhang 2439841d4cb1SJunchao Zhang PetscFunctionBegin; 2440841d4cb1SJunchao Zhang PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 2441841d4cb1SJunchao Zhang PetscCall(MatSetSizes(*B,n,n,n,n)); 2442841d4cb1SJunchao Zhang (*B)->factortype = ftype; 2443841d4cb1SJunchao Zhang PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE)); 2444841d4cb1SJunchao Zhang 2445841d4cb1SJunchao Zhang if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE)); 2446841d4cb1SJunchao Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2447841d4cb1SJunchao Zhang PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 2448841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2449841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2450841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2451841d4cb1SJunchao Zhang } else { 2452841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2453841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2454841d4cb1SJunchao Zhang } 2455841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 2456841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 
2457841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2458841d4cb1SJunchao Zhang } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2459841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2460841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2461841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2462841d4cb1SJunchao Zhang } else { 2463841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2464841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2465841d4cb1SJunchao Zhang } 2466841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2467841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2468841d4cb1SJunchao Zhang } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 2469841d4cb1SJunchao Zhang 2470841d4cb1SJunchao Zhang PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 2471841d4cb1SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 2472841d4cb1SJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 2473841d4cb1SJunchao Zhang PetscFunctionReturn(0); 2474841d4cb1SJunchao Zhang } 2475841d4cb1SJunchao Zhang 24767e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 24777e8381f9SStefano Zampini { 24787e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 24797e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 24800dd8c0acSJunchao Zhang #if CUSPARSE_VERSION >= 13500 2481da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 24820dd8c0acSJunchao Zhang 
#endif 24837e8381f9SStefano Zampini 24847e8381f9SStefano Zampini PetscFunctionBegin; 24857e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 24869566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 2487da112707SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 2488da112707SJunchao Zhang CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 24899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2490da112707SJunchao Zhang } 2491da112707SJunchao Zhang #if CUSPARSE_VERSION >= 13500 2492da112707SJunchao Zhang else if (fs->csrVal) { 2493da112707SJunchao Zhang /* We have a factorized matrix on device and are able to copy it to host */ 2494da112707SJunchao Zhang PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2495da112707SJunchao Zhang } 2496da112707SJunchao Zhang #endif 2497da112707SJunchao Zhang else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host"); 24989566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 24999566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 25007e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 25017e8381f9SStefano Zampini } 25027e8381f9SStefano Zampini PetscFunctionReturn(0); 25037e8381f9SStefano Zampini } 25047e8381f9SStefano Zampini 25057e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 25067e8381f9SStefano Zampini { 25077e8381f9SStefano Zampini PetscFunctionBegin; 25089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 250967a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 251067a45760SJunchao Zhang PetscFunctionReturn(0); 251167a45760SJunchao Zhang } 251267a45760SJunchao Zhang 251367a45760SJunchao Zhang static 
PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 251467a45760SJunchao Zhang { 251567a45760SJunchao Zhang PetscFunctionBegin; 25167e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 251767a45760SJunchao Zhang *array = NULL; 251867a45760SJunchao Zhang PetscFunctionReturn(0); 251967a45760SJunchao Zhang } 252067a45760SJunchao Zhang 252167a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 252267a45760SJunchao Zhang { 252367a45760SJunchao Zhang PetscFunctionBegin; 25249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 252567a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 252667a45760SJunchao Zhang PetscFunctionReturn(0); 252767a45760SJunchao Zhang } 252867a45760SJunchao Zhang 252967a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 253067a45760SJunchao Zhang { 253167a45760SJunchao Zhang PetscFunctionBegin; 253267a45760SJunchao Zhang *array = NULL; 253367a45760SJunchao Zhang PetscFunctionReturn(0); 253467a45760SJunchao Zhang } 253567a45760SJunchao Zhang 253667a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 253767a45760SJunchao Zhang { 253867a45760SJunchao Zhang PetscFunctionBegin; 253967a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 254067a45760SJunchao Zhang PetscFunctionReturn(0); 254167a45760SJunchao Zhang } 254267a45760SJunchao Zhang 254367a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 254467a45760SJunchao Zhang { 254567a45760SJunchao Zhang PetscFunctionBegin; 254667a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 254767a45760SJunchao Zhang *array = NULL; 25487e8381f9SStefano Zampini PetscFunctionReturn(0); 25497e8381f9SStefano Zampini } 25507e8381f9SStefano Zampini 25517ee59b9bSJunchao Zhang static PetscErrorCode 
MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 25527ee59b9bSJunchao Zhang { 25537ee59b9bSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp; 25547ee59b9bSJunchao Zhang CsrMatrix *matrix; 25557ee59b9bSJunchao Zhang 25567ee59b9bSJunchao Zhang PetscFunctionBegin; 25577ee59b9bSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 25587ee59b9bSJunchao Zhang PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 25597ee59b9bSJunchao Zhang cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr); 25607ee59b9bSJunchao Zhang PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL"); 25617ee59b9bSJunchao Zhang matrix = (CsrMatrix*)cusp->mat->mat; 25627ee59b9bSJunchao Zhang 25637ee59b9bSJunchao Zhang if (i) { 25647ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 25657ee59b9bSJunchao Zhang *i = matrix->row_offsets->data().get(); 25667ee59b9bSJunchao Zhang #else 25677ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 25687ee59b9bSJunchao Zhang #endif 25697ee59b9bSJunchao Zhang } 25707ee59b9bSJunchao Zhang if (j) { 25717ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 25727ee59b9bSJunchao Zhang *j = matrix->column_indices->data().get(); 25737ee59b9bSJunchao Zhang #else 25747ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 25757ee59b9bSJunchao Zhang #endif 25767ee59b9bSJunchao Zhang } 25777ee59b9bSJunchao Zhang if (a) *a = matrix->values->data().get(); 25787ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 25797ee59b9bSJunchao Zhang PetscFunctionReturn(0); 25807ee59b9bSJunchao Zhang } 25817ee59b9bSJunchao Zhang 2582042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 25839ae82921SPaul Mullowney { 2584aa372e3fSPaul Mullowney 
Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 25857c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 25869ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2587213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 2588aa372e3fSPaul Mullowney cusparseStatus_t stat; 2589abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 25909ae82921SPaul Mullowney 25919ae82921SPaul Mullowney PetscFunctionBegin; 259228b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 2593c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2594a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2595a49f1ed0SStefano Zampini CsrMatrix *matrix; 2596afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 259785ba7357SStefano Zampini 259808401ef6SPierre Jolivet PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 25999566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 2600afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 26019566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 26029566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar))); 26039566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 26049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 260534d6c7a5SJose E. 
Roman } else { 2606abb89eb1SStefano Zampini PetscInt nnz; 26079566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 26089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format)); 26099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 26107c700b8dSJunchao Zhang delete cusparsestruct->workVector; 261181902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 2612a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 2613a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 26149ae82921SPaul Mullowney try { 26159ae82921SPaul Mullowney if (a->compressedrow.use) { 26169ae82921SPaul Mullowney m = a->compressedrow.nrows; 26179ae82921SPaul Mullowney ii = a->compressedrow.i; 26189ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 26199ae82921SPaul Mullowney } else { 2620213423ffSJunchao Zhang m = A->rmap->n; 2621213423ffSJunchao Zhang ii = a->i; 2622e6e9a74fSStefano Zampini ridx = NULL; 26239ae82921SPaul Mullowney } 262408401ef6SPierre Jolivet PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 2625abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 2626abb89eb1SStefano Zampini else nnz = a->nz; 262708401ef6SPierre Jolivet PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 26289ae82921SPaul Mullowney 262985ba7357SStefano Zampini /* create cusparse matrix */ 2630abb89eb1SStefano Zampini cusparsestruct->nrows = m; 2631aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 26329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 26339566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 26349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 26359ae82921SPaul 
Mullowney 26369566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar))); 26379566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar))); 26389566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 26399566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 26409566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 26419566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 26429566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2643b06137fdSPaul Mullowney 2644aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2645aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 2646aa372e3fSPaul Mullowney /* set the matrix */ 2647afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 2648afb2bd1cSJunchao Zhang mat->num_rows = m; 2649afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2650abb89eb1SStefano Zampini mat->num_entries = nnz; 2651afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 2652afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 26539ae82921SPaul Mullowney 2654abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 2655abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 2656aa372e3fSPaul Mullowney 2657abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 2658abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 2659aa372e3fSPaul Mullowney 2660aa372e3fSPaul Mullowney /* assign the pointer */ 2661afb2bd1cSJunchao Zhang 
matstruct->mat = mat; 2662afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2663afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 2664afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 2665afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 2666afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 2667afb2bd1cSJunchao Zhang mat->values->data().get(), 2668afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 26699566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2670afb2bd1cSJunchao Zhang } 2671afb2bd1cSJunchao Zhang #endif 2672aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 2673afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2674afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2675afb2bd1cSJunchao Zhang #else 2676afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 2677afb2bd1cSJunchao Zhang mat->num_rows = m; 2678afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2679abb89eb1SStefano Zampini mat->num_entries = nnz; 2680afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 2681afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 2682aa372e3fSPaul Mullowney 2683abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 2684abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 2685aa372e3fSPaul Mullowney 2686abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 2687abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 2688aa372e3fSPaul Mullowney 2689aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 26909566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 
2691aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 2692aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2693afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 2694afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 2695afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 2696afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 26979566063dSJacob Faibussowitsch hybMat, 0, partition);PetscCallCUSPARSE(stat); 2698aa372e3fSPaul Mullowney /* assign the pointer */ 2699aa372e3fSPaul Mullowney matstruct->mat = hybMat; 2700aa372e3fSPaul Mullowney 2701afb2bd1cSJunchao Zhang if (mat) { 2702afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 2703afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 2704afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 2705afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 2706087f3262SPaul Mullowney } 2707afb2bd1cSJunchao Zhang #endif 2708087f3262SPaul Mullowney } 2709ca45077fSPaul Mullowney 2710aa372e3fSPaul Mullowney /* assign the compressed row indices */ 2711213423ffSJunchao Zhang if (a->compressedrow.use) { 2712213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 2713aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 2714aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 2715213423ffSJunchao Zhang tmp = m; 2716213423ffSJunchao Zhang } else { 2717213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 2718213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 2719213423ffSJunchao Zhang tmp = 0; 2720213423ffSJunchao Zhang } 27219566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar))); 2722aa372e3fSPaul Mullowney 2723aa372e3fSPaul Mullowney /* 
assign the pointer */ 2724aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 27259ae82921SPaul Mullowney } catch(char *ex) { 272698921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 27279ae82921SPaul Mullowney } 27289566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 27299566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 273034d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 273134d6c7a5SJose E. Roman } 2732abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 27339ae82921SPaul Mullowney } 27349ae82921SPaul Mullowney PetscFunctionReturn(0); 27359ae82921SPaul Mullowney } 27369ae82921SPaul Mullowney 2737c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 2738aa372e3fSPaul Mullowney { 2739aa372e3fSPaul Mullowney template <typename Tuple> 2740aa372e3fSPaul Mullowney __host__ __device__ 2741aa372e3fSPaul Mullowney void operator()(Tuple t) 2742aa372e3fSPaul Mullowney { 2743aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2744aa372e3fSPaul Mullowney } 2745aa372e3fSPaul Mullowney }; 2746aa372e3fSPaul Mullowney 27477e8381f9SStefano Zampini struct VecCUDAEquals 27487e8381f9SStefano Zampini { 27497e8381f9SStefano Zampini template <typename Tuple> 27507e8381f9SStefano Zampini __host__ __device__ 27517e8381f9SStefano Zampini void operator()(Tuple t) 27527e8381f9SStefano Zampini { 27537e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 27547e8381f9SStefano Zampini } 27557e8381f9SStefano Zampini }; 27567e8381f9SStefano Zampini 2757e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 2758e6e9a74fSStefano Zampini { 2759e6e9a74fSStefano Zampini template <typename Tuple> 2760e6e9a74fSStefano Zampini __host__ __device__ 2761e6e9a74fSStefano Zampini void operator()(Tuple t) 2762e6e9a74fSStefano Zampini { 2763e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 2764e6e9a74fSStefano Zampini } 
2765e6e9a74fSStefano Zampini }; 2766e6e9a74fSStefano Zampini 2767afb2bd1cSJunchao Zhang struct MatMatCusparse { 2768ccdfe979SStefano Zampini PetscBool cisdense; 2769ccdfe979SStefano Zampini PetscScalar *Bt; 2770ccdfe979SStefano Zampini Mat X; 2771fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2772fcdce8c4SStefano Zampini PetscLogDouble flops; 2773fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 2774b4285af6SJunchao Zhang 2775afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2776fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2777afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2778afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 2779afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 2780afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2781b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2782b4285af6SJunchao Zhang void *dBuffer4; 2783b4285af6SJunchao Zhang void *dBuffer5; 2784b4285af6SJunchao Zhang #endif 2785fcdce8c4SStefano Zampini size_t mmBufferSize; 2786fcdce8c4SStefano Zampini void *mmBuffer; 2787fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2788fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2789afb2bd1cSJunchao Zhang #endif 2790afb2bd1cSJunchao Zhang }; 2791ccdfe979SStefano Zampini 2792ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2793ccdfe979SStefano Zampini { 2794ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 2795ccdfe979SStefano Zampini 2796ccdfe979SStefano Zampini PetscFunctionBegin; 27979566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->Bt)); 2798fcdce8c4SStefano Zampini delete mmdata->Bcsr; 2799afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 28009566063dSJacob Faibussowitsch if (mmdata->matSpBDescr) 
PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 28019566063dSJacob Faibussowitsch if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 28029566063dSJacob Faibussowitsch if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 28039566063dSJacob Faibussowitsch if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2804b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 28059566063dSJacob Faibussowitsch if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 28069566063dSJacob Faibussowitsch if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2807b4285af6SJunchao Zhang #endif 28089566063dSJacob Faibussowitsch if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 28099566063dSJacob Faibussowitsch if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2810afb2bd1cSJunchao Zhang #endif 28119566063dSJacob Faibussowitsch PetscCall(MatDestroy(&mmdata->X)); 28129566063dSJacob Faibussowitsch PetscCall(PetscFree(data)); 2813ccdfe979SStefano Zampini PetscFunctionReturn(0); 2814ccdfe979SStefano Zampini } 2815ccdfe979SStefano Zampini 2816ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2817ccdfe979SStefano Zampini 2818ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2819ccdfe979SStefano Zampini { 2820ccdfe979SStefano Zampini Mat_Product *product = C->product; 2821ccdfe979SStefano Zampini Mat A,B; 2822afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 2823ccdfe979SStefano Zampini PetscBool flg,biscuda; 2824ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2825ccdfe979SStefano Zampini cusparseStatus_t stat; 2826ccdfe979SStefano Zampini cusparseOperation_t opA; 2827ccdfe979SStefano Zampini const PetscScalar *barray; 2828ccdfe979SStefano Zampini PetscScalar *carray; 2829ccdfe979SStefano 
Zampini MatMatCusparse *mmdata; 2830ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2831ccdfe979SStefano Zampini CsrMatrix *csrmat; 2832ccdfe979SStefano Zampini 2833ccdfe979SStefano Zampini PetscFunctionBegin; 2834ccdfe979SStefano Zampini MatCheckProduct(C,1); 283528b400f6SJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2836ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 2837ccdfe979SStefano Zampini A = product->A; 2838ccdfe979SStefano Zampini B = product->B; 28399566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 284028b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2841ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2842ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 284328b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 28449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2845ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2846ccdfe979SStefano Zampini switch (product->type) { 2847ccdfe979SStefano Zampini case MATPRODUCT_AB: 2848ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2849ccdfe979SStefano Zampini mat = cusp->mat; 2850ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2851ccdfe979SStefano Zampini m = A->rmap->n; 2852ccdfe979SStefano Zampini n = B->cmap->n; 2853ccdfe979SStefano Zampini break; 2854ccdfe979SStefano Zampini case MATPRODUCT_AtB: 28551a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2856e6e9a74fSStefano Zampini mat = cusp->mat; 2857e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 
2858e6e9a74fSStefano Zampini } else { 28599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2860ccdfe979SStefano Zampini mat = cusp->matTranspose; 2861ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2862e6e9a74fSStefano Zampini } 2863ccdfe979SStefano Zampini m = A->cmap->n; 2864ccdfe979SStefano Zampini n = B->cmap->n; 2865ccdfe979SStefano Zampini break; 2866ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2867ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2868ccdfe979SStefano Zampini mat = cusp->mat; 2869ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2870ccdfe979SStefano Zampini m = A->rmap->n; 2871ccdfe979SStefano Zampini n = B->rmap->n; 2872ccdfe979SStefano Zampini break; 2873ccdfe979SStefano Zampini default: 287498921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2875ccdfe979SStefano Zampini } 287628b400f6SJacob Faibussowitsch PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2877ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2878ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 28799566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 28809566063dSJacob Faibussowitsch if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 28819566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2882afb2bd1cSJunchao Zhang 28839566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(B,&blda)); 2884c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 28859566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 28869566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2887c8378d12SStefano Zampini } else { 28889566063dSJacob 
Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 28899566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C,&clda)); 2890c8378d12SStefano Zampini } 2891c8378d12SStefano Zampini 28929566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2893afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2894afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2895a5b23f4aSJose E. Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2896afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2897fcdce8c4SStefano Zampini size_t mmBufferSize; 28989566063dSJacob Faibussowitsch if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2899afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 29009566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2901afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2902afb2bd1cSJunchao Zhang } 2903c8378d12SStefano Zampini 29049566063dSJacob Faibussowitsch if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2905afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 29069566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2907afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2908afb2bd1cSJunchao Zhang } 2909afb2bd1cSJunchao Zhang 2910afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2911afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2912afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 
2913afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2914afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2915afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 29169566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2917afb2bd1cSJunchao Zhang } 2918afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2919afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2920afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 29219566063dSJacob Faibussowitsch cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2922fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 29239566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 29249566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2925fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2926fcdce8c4SStefano Zampini } 2927afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2928afb2bd1cSJunchao Zhang } else { 2929afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 29309566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 29319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 29329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2933afb2bd1cSJunchao Zhang } 2934afb2bd1cSJunchao Zhang 2935afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2936afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2937afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2938afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 
29399566063dSJacob Faibussowitsch cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2940afb2bd1cSJunchao Zhang #else 2941afb2bd1cSJunchao Zhang PetscInt k; 2942afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2943ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2944ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2945ccdfe979SStefano Zampini cublasStatus_t cerr; 2946ccdfe979SStefano Zampini 29479566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2948ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2949ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2950ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2951ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 29529566063dSJacob Faibussowitsch mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2953ccdfe979SStefano Zampini blda = B->cmap->n; 2954afb2bd1cSJunchao Zhang k = B->cmap->n; 2955afb2bd1cSJunchao Zhang } else { 2956afb2bd1cSJunchao Zhang k = B->rmap->n; 2957ccdfe979SStefano Zampini } 2958ccdfe979SStefano Zampini 2959afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2960ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2961afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2962ccdfe979SStefano Zampini csrmat->values->data().get(), 2963ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2964ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2965ccdfe979SStefano Zampini mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 29669566063dSJacob Faibussowitsch carray,clda);PetscCallCUSPARSE(stat); 2967afb2bd1cSJunchao Zhang #endif 29689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 29699566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries)); 29709566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayRead(B,&barray)); 2971ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 29729566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 29739566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE)); 2974ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 29759566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 29769566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE)); 2977ccdfe979SStefano Zampini } else { 29789566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray)); 2979ccdfe979SStefano Zampini } 2980ccdfe979SStefano Zampini if (mmdata->cisdense) { 29819566063dSJacob Faibussowitsch PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C)); 2982ccdfe979SStefano Zampini } 2983ccdfe979SStefano Zampini if (!biscuda) { 29849566063dSJacob Faibussowitsch PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B)); 2985ccdfe979SStefano Zampini } 2986ccdfe979SStefano Zampini PetscFunctionReturn(0); 2987ccdfe979SStefano Zampini } 2988ccdfe979SStefano Zampini 2989ccdfe979SStefano Zampini /* MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for the products AB, AtB, ABt, PtAP and RARt where A is MATSEQAIJCUSPARSE stored as MAT_CUSPARSE_CSR (both are checked below; the type of B is not checked in this routine). Input: C - the product matrix, with A, B and the product type taken from C->product. On return C has size m x n and type MATSEQDENSECUDA, C->product->data holds a new MatMatCusparse, and the numeric routine is set to MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA. Errors with PETSC_ERR_GPU if product data already exists, A has the wrong type or format, or the product type is unsupported. */ static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2990ccdfe979SStefano Zampini { 2991ccdfe979SStefano Zampini Mat_Product *product = C->product; 2992ccdfe979SStefano Zampini Mat A,B; 2993ccdfe979SStefano Zampini PetscInt m,n; 2994ccdfe979SStefano Zampini PetscBool cisdense,flg; 2995ccdfe979SStefano Zampini MatMatCusparse 
*mmdata; 2996ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2997ccdfe979SStefano Zampini 2998ccdfe979SStefano Zampini PetscFunctionBegin; 2999ccdfe979SStefano Zampini MatCheckProduct(C,1); 300028b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 3001ccdfe979SStefano Zampini A = product->A; 3002ccdfe979SStefano Zampini B = product->B; 30039566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 300428b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 3005ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 300608401ef6SPierre Jolivet PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3007ccdfe979SStefano Zampini /* choose the size m x n of C from the product type; the default case errors out, so m and n are always set past the switch */ switch (product->type) { 3008ccdfe979SStefano Zampini case MATPRODUCT_AB: 3009ccdfe979SStefano Zampini m = A->rmap->n; 3010ccdfe979SStefano Zampini n = B->cmap->n; 3011ccdfe979SStefano Zampini break; 3012ccdfe979SStefano Zampini case MATPRODUCT_AtB: 3013ccdfe979SStefano Zampini m = A->cmap->n; 3014ccdfe979SStefano Zampini n = B->cmap->n; 3015ccdfe979SStefano Zampini break; 3016ccdfe979SStefano Zampini case MATPRODUCT_ABt: 3017ccdfe979SStefano Zampini m = A->rmap->n; 3018ccdfe979SStefano Zampini n = B->rmap->n; 3019ccdfe979SStefano Zampini break; 3020ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 3021ccdfe979SStefano Zampini m = B->cmap->n; 3022ccdfe979SStefano Zampini n = B->cmap->n; 3023ccdfe979SStefano Zampini break; 3024ccdfe979SStefano Zampini case MATPRODUCT_RARt: 3025ccdfe979SStefano Zampini m = B->rmap->n; 3026ccdfe979SStefano Zampini n = B->rmap->n; 3027ccdfe979SStefano Zampini break; 3028ccdfe979SStefano Zampini default: 302998921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type 
%s",MatProductTypes[product->type]); 3030ccdfe979SStefano Zampini } 30319566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,m,n)); 3032ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 30339566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense)); 30349566063dSJacob Faibussowitsch PetscCall(MatSetType(C,MATSEQDENSECUDA)); 3035ccdfe979SStefano Zampini 3036ccdfe979SStefano Zampini /* product data */ 30379566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 3038ccdfe979SStefano Zampini /* cisdense was sampled before MatSetType() above switched C to MATSEQDENSECUDA; the numeric phase reads mmdata->cisdense to convert C back to MATSEQDENSE */ mmdata->cisdense = cisdense; 3039afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 3040afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 3041ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 30429566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar))); 3043ccdfe979SStefano Zampini } 3044afb2bd1cSJunchao Zhang #endif 3045ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 3046ccdfe979SStefano Zampini /* X is sized A->rmap->n by n: it has the rows of A and the same number of columns as C; the numeric phase multiplies it with B to form C */ if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 30479566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X)); 30489566063dSJacob Faibussowitsch PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA)); 3049ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 30509566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n)); 3051ccdfe979SStefano Zampini } else { 30529566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n)); 3053ccdfe979SStefano Zampini } 3054ccdfe979SStefano 
Zampini } 3055ccdfe979SStefano Zampini /* attach the product data to C and register MatDestroy_MatMatCusparse as its destroy callback */ C->product->data = mmdata; 3056ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 3057ccdfe979SStefano Zampini 3058ccdfe979SStefano Zampini /* numeric routine to run for this product */ C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 3059ccdfe979SStefano Zampini PetscFunctionReturn(0); 3060ccdfe979SStefano Zampini } 3061ccdfe979SStefano Zampini 3062fcdce8c4SStefano Zampini /* MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse product for MATPRODUCT_AB, MATPRODUCT_AtB and MATPRODUCT_ABt, with A, B and C all MATSEQAIJCUSPARSE in MAT_CUSPARSE_CSR format. Input: C - the product matrix; C->product->data must be non-NULL and is used as the MatMatCusparse product data. The multiplication is skipped when mmdata->reusesym is set or when C has no nonzeros; in every case the routine ends with the assembly bookkeeping at the finalize label. */ static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3063ccdfe979SStefano Zampini { 3064ccdfe979SStefano Zampini Mat_Product *product = C->product; 3065fcdce8c4SStefano Zampini Mat A,B; 3066fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 3067fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 3068fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 3069fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 3070fcdce8c4SStefano Zampini PetscBool flg; 3071fcdce8c4SStefano Zampini cusparseStatus_t stat; 3072fcdce8c4SStefano Zampini MatProductType ptype; 3073fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 3074fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3075fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 3076fcdce8c4SStefano Zampini #endif 3077b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3078ccdfe979SStefano Zampini 3079ccdfe979SStefano Zampini PetscFunctionBegin; 3080ccdfe979SStefano Zampini MatCheckProduct(C,1); 308128b400f6SJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 30829566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg)); 308328b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 3084fcdce8c4SStefano Zampini mmdata = 
(MatMatCusparse*)C->product->data; 3085fcdce8c4SStefano Zampini A = product->A; 3086fcdce8c4SStefano Zampini B = product->B; 3087fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 3088fcdce8c4SStefano Zampini /* clear the flag so that the next call takes the compute path below; only sanity checks on C are done before jumping to finalize */ mmdata->reusesym = PETSC_FALSE; 3089fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 309008401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3091fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 309228b400f6SJacob Faibussowitsch PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 3093fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 309428b400f6SJacob Faibussowitsch PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 3095fcdce8c4SStefano Zampini goto finalize; 3096fcdce8c4SStefano Zampini } 3097fcdce8c4SStefano Zampini /* C stores no nonzeros: skip the multiplication and only run the bookkeeping at finalize */ if (!c->nz) goto finalize; 30989566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 309928b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 31009566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 310128b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 310228b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 310328b400f6SJacob Faibussowitsch PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix 
between MatProductSymbolic and MatProductNumeric phases"); 3104fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3105fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 3106fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 310708401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 310808401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 310908401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 31109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 31119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3112fcdce8c4SStefano Zampini 3113fcdce8c4SStefano Zampini /* when A (for AtB) or B (for ABt) is flagged symmetric the product is computed as plain AB; the checks below require that the symbolic phase recorded the same substitution */ ptype = product->type; 3114fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 3115fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 311628b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 3117fa046f9fSJunchao Zhang } 3118fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 3119fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 312028b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 3121fa046f9fSJunchao Zhang } 3122fcdce8c4SStefano Zampini /* select the mult structs: AtB reads the matTranspose struct of A and ABt reads the matTranspose struct of B, since opA and opB stay NON_TRANSPOSE */ switch (ptype) { 3123fcdce8c4SStefano Zampini case MATPRODUCT_AB: 3124fcdce8c4SStefano Zampini Amat = Acusp->mat; 3125fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3126fcdce8c4SStefano Zampini break; 3127fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 3128fcdce8c4SStefano Zampini 
Amat = Acusp->matTranspose; 3129fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3130fcdce8c4SStefano Zampini break; 3131fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 3132fcdce8c4SStefano Zampini Amat = Acusp->mat; 3133fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 3134fcdce8c4SStefano Zampini break; 3135fcdce8c4SStefano Zampini default: 313698921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 3137fcdce8c4SStefano Zampini } 3138fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 313928b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 314028b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 314128b400f6SJacob Faibussowitsch PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 3142fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 3143fcdce8c4SStefano Zampini /* prefer the CSR struct stored in mmdata->Bcsr by the symbolic phase, when present */ Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 3144fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 314528b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 314628b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 314728b400f6SJacob Faibussowitsch PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 31489566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3149fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3150fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 31519566063dSJacob Faibussowitsch /* Cmat->alpha_one and Cmat->beta_zero are device allocations made with cudaMalloc in the symbolic phase, so the scalars are passed in device pointer mode */ PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3152b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3153b4285af6SJunchao Zhang /* CUDA >= 11.4: compute with the SpGEMMreuse descriptor prepared by the symbolic phase */ stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 3154b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3155b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 31569566063dSJacob Faibussowitsch mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3157b4285af6SJunchao Zhang #else 3158b4285af6SJunchao Zhang /* CUDA 11.0 to 11.3: cusparseSpGEMM_compute using mmdata->mmBuffer, followed by cusparseSpGEMM_copy */ stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3159fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3160fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 31619566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 3162b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 3163fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 31649566063dSJacob Faibussowitsch cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3165b4285af6SJunchao Zhang #endif 3166fcdce8c4SStefano Zampini #else 3167b4285af6SJunchao Zhang /* CUDA < 11: cusparse_csr_spgemm is called directly on the CSR arrays of A, B and C */ stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 3168fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3169fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3170fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 31719566063dSJacob Faibussowitsch Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), 
Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 3172fcdce8c4SStefano Zampini #endif 31739566063dSJacob Faibussowitsch /* mmdata->flops was precomputed by the symbolic phase */ PetscCall(PetscLogGpuFlops(mmdata->flops)); 31749566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 31759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3176fcdce8c4SStefano Zampini /* the values of C were just computed by the cuSPARSE calls above, so mark the GPU as holding them */ C->offloadmask = PETSC_OFFLOAD_GPU; 3177fcdce8c4SStefano Zampini finalize: 3178fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 31799566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 31809566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 31819566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 3182fcdce8c4SStefano Zampini c->reallocs = 0; 3183fcdce8c4SStefano Zampini /* adds zero: the malloc count is left unchanged */ C->info.mallocs += 0; 3184fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 3185fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 3186fcdce8c4SStefano Zampini C->num_ass++; 3187ccdfe979SStefano Zampini PetscFunctionReturn(0); 3188ccdfe979SStefano Zampini } 3189fcdce8c4SStefano Zampini 3190fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3191fcdce8c4SStefano Zampini { 3192fcdce8c4SStefano Zampini Mat_Product *product = C->product; 3193fcdce8c4SStefano Zampini Mat A,B; 3194fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 3195fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 3196fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 3197fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 3198fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 3199fcdce8c4SStefano Zampini PetscBool flg; 3200fcdce8c4SStefano Zampini cusparseStatus_t stat; 3201fcdce8c4SStefano Zampini MatProductType ptype; 3202fcdce8c4SStefano 
Zampini MatMatCusparse *mmdata; 3203fcdce8c4SStefano Zampini PetscLogDouble flops; 3204fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 3205fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3206fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 3207fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 3208fcdce8c4SStefano Zampini #else 3209fcdce8c4SStefano Zampini int cnz; 3210fcdce8c4SStefano Zampini #endif 3211b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3212fcdce8c4SStefano Zampini 3213fcdce8c4SStefano Zampini PetscFunctionBegin; 3214fcdce8c4SStefano Zampini MatCheckProduct(C,1); 321528b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 3216fcdce8c4SStefano Zampini A = product->A; 3217fcdce8c4SStefano Zampini B = product->B; 32189566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 321928b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 32209566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 322128b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 3222fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 3223fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 3224fcdce8c4SStefano Zampini /* product data */ 32259566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 3226fcdce8c4SStefano Zampini C->product->data = mmdata; 3227fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 3228fcdce8c4SStefano Zampini 32299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 32309566063dSJacob Faibussowitsch 
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3231d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3232d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 323308401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 323408401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3235d60bce21SJunchao Zhang 3236fcdce8c4SStefano Zampini ptype = product->type; 3237fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 3238fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 3239fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3240fa046f9fSJunchao Zhang } 3241fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 3242fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 3243fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3244fa046f9fSJunchao Zhang } 3245fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 3246fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 3247fcdce8c4SStefano Zampini switch (ptype) { 3248fcdce8c4SStefano Zampini case MATPRODUCT_AB: 3249fcdce8c4SStefano Zampini m = A->rmap->n; 3250fcdce8c4SStefano Zampini n = B->cmap->n; 3251fcdce8c4SStefano Zampini k = A->cmap->n; 3252fcdce8c4SStefano Zampini Amat = Acusp->mat; 3253fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3254fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3255fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3256fcdce8c4SStefano Zampini break; 3257fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 3258fcdce8c4SStefano Zampini m = A->cmap->n; 3259fcdce8c4SStefano Zampini n = B->cmap->n; 3260fcdce8c4SStefano Zampini k = A->rmap->n; 32619566063dSJacob Faibussowitsch 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3262fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 3263fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3264fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3265fcdce8c4SStefano Zampini break; 3266fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 3267fcdce8c4SStefano Zampini m = A->rmap->n; 3268fcdce8c4SStefano Zampini n = B->rmap->n; 3269fcdce8c4SStefano Zampini k = A->cmap->n; 32709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3271fcdce8c4SStefano Zampini Amat = Acusp->mat; 3272fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 3273fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3274fcdce8c4SStefano Zampini break; 3275fcdce8c4SStefano Zampini default: 327698921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 3277fcdce8c4SStefano Zampini } 3278fcdce8c4SStefano Zampini 3279fcdce8c4SStefano Zampini /* create cusparse matrix */ 32809566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,m,n)); 32819566063dSJacob Faibussowitsch PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 3282fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 3283fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 3284fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3285fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 3286fcdce8c4SStefano Zampini 3287fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 3288fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3289fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 32909566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 32919566063dSJacob Faibussowitsch 
PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 3292fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3293fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3294fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 3295fcdce8c4SStefano Zampini } else { 3296fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 3297fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 3298fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 3299fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 3300fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 3301fcdce8c4SStefano Zampini } 3302fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 3303fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 3304fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 3305fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 3306fcdce8c4SStefano Zampini Ccsr->num_cols = n; 3307fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 33089566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 33099566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 33109566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 33119566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 33129566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 33139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 33149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 33159566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 33169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 3317fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 3318fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 3319fcdce8c4SStefano Zampini c->nz = 0; 3320fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3321fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 3322fcdce8c4SStefano Zampini goto finalizesym; 3323fcdce8c4SStefano Zampini } 3324fcdce8c4SStefano Zampini 332528b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 332628b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 3327fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 3328fcdce8c4SStefano Zampini if (!biscompressed) { 3329fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 3330fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3331fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 3332fcdce8c4SStefano Zampini #endif 3333fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 3334fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 3335fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 3336fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 3337fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 3338fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 3339fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 3340fcdce8c4SStefano 
Zampini Bcsr->values = cBcsr->values; 3341fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 3342fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3343fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 33449566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 3345fcdce8c4SStefano Zampini } 3346fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3347fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 3348fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3349fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 3350fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 3351fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 3352fcdce8c4SStefano Zampini Bcsr->values->data().get(), 3353fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 33549566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 3355fcdce8c4SStefano Zampini } 3356fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 3357fcdce8c4SStefano Zampini #endif 3358fcdce8c4SStefano Zampini } 335928b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 336028b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 3361fcdce8c4SStefano Zampini /* precompute flops count */ 3362fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 3363fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 3364fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 3365fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 3366fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 3367fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 3368fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - 
b->i[brow]); 3369fcdce8c4SStefano Zampini } 3370fcdce8c4SStefano Zampini } 3371fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 3372fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 3373fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 3374fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 3375fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 3376fcdce8c4SStefano Zampini } 3377fcdce8c4SStefano Zampini } else { /* TODO */ 3378fcdce8c4SStefano Zampini flops = 0.; 3379fcdce8c4SStefano Zampini } 3380fcdce8c4SStefano Zampini 3381fcdce8c4SStefano Zampini mmdata->flops = flops; 33829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3383b4285af6SJunchao Zhang 3384fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 33859566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3386fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 3387fcdce8c4SStefano Zampini NULL, NULL, NULL, 3388fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 33899566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 33909566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3391b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3392b4285af6SJunchao Zhang { 3393b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
3394b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3395b4285af6SJunchao Zhang */ 3396b4285af6SJunchao Zhang void* dBuffer1 = NULL; 3397b4285af6SJunchao Zhang void* dBuffer2 = NULL; 3398b4285af6SJunchao Zhang void* dBuffer3 = NULL; 3399b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3400b4285af6SJunchao Zhang size_t bufferSize1 = 0; 3401b4285af6SJunchao Zhang size_t bufferSize2 = 0; 3402b4285af6SJunchao Zhang size_t bufferSize3 = 0; 3403b4285af6SJunchao Zhang size_t bufferSize4 = 0; 3404b4285af6SJunchao Zhang size_t bufferSize5 = 0; 3405b4285af6SJunchao Zhang 3406b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 3407b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 3408b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3409b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 34109566063dSJacob Faibussowitsch &bufferSize1, NULL);PetscCallCUSPARSE(stat); 34119566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 3412b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 3413b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3414b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 34159566063dSJacob Faibussowitsch &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 3416b4285af6SJunchao Zhang 3417b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 3418b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3419b4285af6SJunchao Zhang 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 34209566063dSJacob Faibussowitsch &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 34219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 34229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 34239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 3424b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3425b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 34269566063dSJacob Faibussowitsch &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 34279566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer1)); 34289566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer2)); 3429b4285af6SJunchao Zhang 3430b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 3431b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 34329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3433b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 3434b4285af6SJunchao Zhang /* allocate matrix C */ 34359566063dSJacob Faibussowitsch Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 34369566063dSJacob Faibussowitsch Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3437b4285af6SJunchao Zhang /* update matC with the new pointers */ 3438b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 34399566063dSJacob Faibussowitsch Ccsr->values->data().get());PetscCallCUSPARSE(stat); 3440b4285af6SJunchao Zhang 
3441b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 3442b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3443b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 34449566063dSJacob Faibussowitsch &bufferSize5, NULL);PetscCallCUSPARSE(stat); 34459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 3446b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3447b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 34489566063dSJacob Faibussowitsch &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 34499566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer3)); 3450b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 3451b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3452b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 34539566063dSJacob Faibussowitsch mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 34549566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 3455b4285af6SJunchao Zhang } 3456ae37ee31SJunchao Zhang #else 3457b4285af6SJunchao Zhang size_t bufSize2; 3458fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 3459b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 3460fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3461fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 34629566063dSJacob 
Faibussowitsch mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 34639566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 3464fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 3465b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 3466fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3467fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 34689566063dSJacob Faibussowitsch mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 3469fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 3470b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3471fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3472fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 34739566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 3474fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 3475fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 3476fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3477fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3478fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 34799566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 3480fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 3481b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3482fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3483fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 34849566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 3485fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 34869566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3487fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 34889566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 3489fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 34909566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3491fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 34929566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3493fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 34949566063dSJacob Faibussowitsch Ccsr->values->data().get());PetscCallCUSPARSE(stat); 3495b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 3496fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 34979566063dSJacob Faibussowitsch cusparse_scalartype, 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3498ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3499fcdce8c4SStefano Zampini #else 35009566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3501b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 3502fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3503fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3504fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 35059566063dSJacob Faibussowitsch Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 3506fcdce8c4SStefano Zampini c->nz = cnz; 3507fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 35089566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3509fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 35109566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3511fcdce8c4SStefano Zampini 35129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3513fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3514fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3515fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 3516b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 3517fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3518fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3519fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 35209566063dSJacob Faibussowitsch Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 3521fcdce8c4SStefano Zampini #endif 35229566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 35239566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3524fcdce8c4SStefano Zampini finalizesym: 3525fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 3526fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 3527fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 35289566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m+1,&c->i)); 35299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->j)); 3530fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3531fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3532fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3533fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3534fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 3535fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 3536fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 35379566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 35389566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3539fcdce8c4SStefano Zampini } else { 3540fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3541fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 35429566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 35439566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3544fcdce8c4SStefano Zampini } 3545fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 3546fcdce8c4SStefano Zampini PetscInt r = 0; 3547fcdce8c4SStefano Zampini c->i[0] = 0; 3548fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 3549fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 3550fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 3551fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 3552fcdce8c4SStefano Zampini } 3553fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 3554fcdce8c4SStefano Zampini } 35559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 35569566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->ilen)); 35579566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->imax)); 3558fcdce8c4SStefano Zampini c->maxnz = c->nz; 3559fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 3560fcdce8c4SStefano Zampini c->rmax = 0; 3561fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 3562fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 3563fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 3564fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 3565fcdce8c4SStefano Zampini c->rmax = 
PetscMax(c->rmax,nn); 3566fcdce8c4SStefano Zampini } 35679566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(C)); 35689566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->a)); 3569fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 3570fcdce8c4SStefano Zampini 3571fcdce8c4SStefano Zampini C->nonzerostate++; 35729566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->rmap)); 35739566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->cmap)); 3574fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 3575fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3576fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 3577fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 3578fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 3579abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3580fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 3581fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 3582fcdce8c4SStefano Zampini } 3583fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3584fcdce8c4SStefano Zampini PetscFunctionReturn(0); 3585fcdce8c4SStefano Zampini } 3586fcdce8c4SStefano Zampini 3587fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3588fcdce8c4SStefano Zampini 3589fcdce8c4SStefano Zampini /* handles sparse or dense B */ 3590fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3591fcdce8c4SStefano Zampini { 3592fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 3593fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 3594fcdce8c4SStefano Zampini 3595fcdce8c4SStefano Zampini PetscFunctionBegin; 3596fcdce8c4SStefano Zampini 
MatCheckProduct(mat,1); 35979566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense)); 3598abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 35999566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp)); 3600fcdce8c4SStefano Zampini } 3601fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 3602fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 3603fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 36049566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp)); 3605fcdce8c4SStefano Zampini } 3606fcdce8c4SStefano Zampini } 360765e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 360865e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 360965e4b4d4SStefano Zampini switch (product->type) { 361065e4b4d4SStefano Zampini case MATPRODUCT_AB: 361165e4b4d4SStefano Zampini if (product->api_user) { 3612d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat"); 36139566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 3614d0609cedSBarry Smith PetscOptionsEnd(); 361565e4b4d4SStefano Zampini } else { 3616d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat"); 36179566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 3618d0609cedSBarry Smith PetscOptionsEnd(); 361965e4b4d4SStefano Zampini } 362065e4b4d4SStefano Zampini break; 362165e4b4d4SStefano Zampini case MATPRODUCT_AtB: 362265e4b4d4SStefano Zampini if (product->api_user) { 3623d0609cedSBarry Smith 
PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat"); 36249566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 3625d0609cedSBarry Smith PetscOptionsEnd(); 362665e4b4d4SStefano Zampini } else { 3627d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat"); 36289566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 3629d0609cedSBarry Smith PetscOptionsEnd(); 363065e4b4d4SStefano Zampini } 363165e4b4d4SStefano Zampini break; 363265e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 363365e4b4d4SStefano Zampini if (product->api_user) { 3634d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat"); 36359566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 3636d0609cedSBarry Smith PetscOptionsEnd(); 363765e4b4d4SStefano Zampini } else { 3638d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat"); 36399566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 3640d0609cedSBarry Smith PetscOptionsEnd(); 364165e4b4d4SStefano Zampini } 364265e4b4d4SStefano Zampini break; 364365e4b4d4SStefano Zampini case MATPRODUCT_RARt: 364465e4b4d4SStefano Zampini if (product->api_user) { 3645d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat"); 36469566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 3647d0609cedSBarry Smith PetscOptionsEnd(); 364865e4b4d4SStefano 
Zampini } else { 3649d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat"); 36509566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 3651d0609cedSBarry Smith PetscOptionsEnd(); 365265e4b4d4SStefano Zampini } 365365e4b4d4SStefano Zampini break; 365465e4b4d4SStefano Zampini case MATPRODUCT_ABC: 365565e4b4d4SStefano Zampini if (product->api_user) { 3656d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat"); 36579566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 3658d0609cedSBarry Smith PetscOptionsEnd(); 365965e4b4d4SStefano Zampini } else { 3660d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat"); 36619566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 3662d0609cedSBarry Smith PetscOptionsEnd(); 366365e4b4d4SStefano Zampini } 366465e4b4d4SStefano Zampini break; 366565e4b4d4SStefano Zampini default: 366665e4b4d4SStefano Zampini break; 366765e4b4d4SStefano Zampini } 366865e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 366965e4b4d4SStefano Zampini } 367065e4b4d4SStefano Zampini /* dispatch */ 3671fcdce8c4SStefano Zampini if (isdense) { 3672ccdfe979SStefano Zampini switch (product->type) { 3673ccdfe979SStefano Zampini case MATPRODUCT_AB: 3674ccdfe979SStefano Zampini case MATPRODUCT_AtB: 3675ccdfe979SStefano Zampini case MATPRODUCT_ABt: 3676ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 3677ccdfe979SStefano Zampini case MATPRODUCT_RARt: 3678fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 36799566063dSJacob Faibussowitsch 
PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3680fcdce8c4SStefano Zampini } else { 3681fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3682fcdce8c4SStefano Zampini } 3683fcdce8c4SStefano Zampini break; 3684fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 3685fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3686fcdce8c4SStefano Zampini break; 3687ccdfe979SStefano Zampini default: 3688ccdfe979SStefano Zampini break; 3689ccdfe979SStefano Zampini } 3690fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 3691fcdce8c4SStefano Zampini switch (product->type) { 3692fcdce8c4SStefano Zampini case MATPRODUCT_AB: 3693fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 3694fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 3695fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3696fcdce8c4SStefano Zampini break; 3697fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 3698fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 3699fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 3700fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3701fcdce8c4SStefano Zampini break; 3702fcdce8c4SStefano Zampini default: 3703fcdce8c4SStefano Zampini break; 3704fcdce8c4SStefano Zampini } 3705fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 37069566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3707fcdce8c4SStefano Zampini } 3708ccdfe979SStefano Zampini PetscFunctionReturn(0); 3709ccdfe979SStefano Zampini } 3710ccdfe979SStefano Zampini 37116fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 37129ae82921SPaul Mullowney { 37139ae82921SPaul Mullowney PetscFunctionBegin; 37149566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE)); 3715e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3716e6e9a74fSStefano Zampini } 
3717e6e9a74fSStefano Zampini 3718e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 3719e6e9a74fSStefano Zampini { 3720e6e9a74fSStefano Zampini PetscFunctionBegin; 37219566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE)); 3722e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3723e6e9a74fSStefano Zampini } 3724e6e9a74fSStefano Zampini 3725e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3726e6e9a74fSStefano Zampini { 3727e6e9a74fSStefano Zampini PetscFunctionBegin; 37289566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE)); 3729e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3730e6e9a74fSStefano Zampini } 3731e6e9a74fSStefano Zampini 3732e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3733e6e9a74fSStefano Zampini { 3734e6e9a74fSStefano Zampini PetscFunctionBegin; 37359566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE)); 37369ae82921SPaul Mullowney PetscFunctionReturn(0); 37379ae82921SPaul Mullowney } 37389ae82921SPaul Mullowney 37396fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3740ca45077fSPaul Mullowney { 3741ca45077fSPaul Mullowney PetscFunctionBegin; 37429566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE)); 3743ca45077fSPaul Mullowney PetscFunctionReturn(0); 3744ca45077fSPaul Mullowney } 3745ca45077fSPaul Mullowney 3746a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 3747a0e72f99SJunchao Zhang { 3748a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 3749a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 3750a0e72f99SJunchao Zhang } 
3751a0e72f99SJunchao Zhang 3752afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3753e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 37549ae82921SPaul Mullowney { 37559ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3756aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 37579ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3758e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 3759e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3760e6e9a74fSStefano Zampini PetscBool compressed; 3761afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3762afb2bd1cSJunchao Zhang PetscInt nx,ny; 3763afb2bd1cSJunchao Zhang #endif 37646e111a19SKarl Rupp 37659ae82921SPaul Mullowney PetscFunctionBegin; 376608401ef6SPierre Jolivet PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3767cbc6b225SStefano Zampini if (!a->nz) { 37689566063dSJacob Faibussowitsch if (!yy) PetscCall(VecSet_SeqCUDA(zz,0)); 37699566063dSJacob Faibussowitsch else PetscCall(VecCopy_SeqCUDA(yy,zz)); 3770e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3771e6e9a74fSStefano Zampini } 377234d6c7a5SJose E. 
Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 37739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3774e6e9a74fSStefano Zampini if (!trans) { 37759ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 37765f80ce2aSJacob Faibussowitsch PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3777e6e9a74fSStefano Zampini } else { 37781a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3779e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3780e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3781e6e9a74fSStefano Zampini } else { 37829566063dSJacob Faibussowitsch if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3783e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3784e6e9a74fSStefano Zampini } 3785e6e9a74fSStefano Zampini } 3786e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3787e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3788213423ffSJunchao Zhang 3789e6e9a74fSStefano Zampini try { 37909566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray)); 37919566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 37929566063dSJacob Faibussowitsch else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */ 3793afb2bd1cSJunchao Zhang 37949566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3795e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3796afb2bd1cSJunchao Zhang /* z = A x + beta y. 
3797afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3798afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3799afb2bd1cSJunchao Zhang */ 3800e6e9a74fSStefano Zampini xptr = xarray; 3801afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3802213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3803afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3804afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3805afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3806afb2bd1cSJunchao Zhang */ 3807afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3808afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3809afb2bd1cSJunchao Zhang nx = mat->num_cols; 3810afb2bd1cSJunchao Zhang ny = mat->num_rows; 3811afb2bd1cSJunchao Zhang } 3812afb2bd1cSJunchao Zhang #endif 3813e6e9a74fSStefano Zampini } else { 3814afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3815afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3816afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3817afb2bd1cSJunchao Zhang */ 3818afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3819e6e9a74fSStefano Zampini dptr = zarray; 3820e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3821afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3822e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3823a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3824e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3825e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 3826e6e9a74fSStefano Zampini } 3827afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3828afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3829afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3830afb2bd1cSJunchao Zhang nx = mat->num_rows; 3831afb2bd1cSJunchao Zhang ny = mat->num_cols; 3832afb2bd1cSJunchao Zhang } 3833afb2bd1cSJunchao Zhang #endif 3834e6e9a74fSStefano Zampini } 38359ae82921SPaul Mullowney 3836afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3837aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3838afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 38395f80ce2aSJacob Faibussowitsch PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3840afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 38419566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype)); 38429566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype)); 
38439566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3844afb2bd1cSJunchao Zhang matstruct->matDescr, 3845afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 3846afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3847afb2bd1cSJunchao Zhang cusparse_scalartype, 3848afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 38495f80ce2aSJacob Faibussowitsch &matstruct->cuSpMV[opA].spmvBufferSize)); 38509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize)); 3851afb2bd1cSJunchao Zhang 3852afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3853afb2bd1cSJunchao Zhang } else { 3854afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 38559566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr)); 38569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr)); 3857afb2bd1cSJunchao Zhang } 3858afb2bd1cSJunchao Zhang 38599566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, 3860afb2bd1cSJunchao Zhang matstruct->alpha_one, 38613606e59fSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3862afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 3863afb2bd1cSJunchao Zhang beta, 3864afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3865afb2bd1cSJunchao Zhang cusparse_scalartype, 3866afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 38675f80ce2aSJacob Faibussowitsch matstruct->cuSpMV[opA].spmvBuffer)); 3868afb2bd1cSJunchao Zhang #else 38697656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 38709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, 
3871a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 3872afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 3873aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 3874e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 38755f80ce2aSJacob Faibussowitsch dptr)); 3876afb2bd1cSJunchao Zhang #endif 3877aa372e3fSPaul Mullowney } else { 3878213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3879afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3880afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3881afb2bd1cSJunchao Zhang #else 3882301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 38839566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, 3884afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 3885e6e9a74fSStefano Zampini xptr, beta, 38865f80ce2aSJacob Faibussowitsch dptr)); 3887afb2bd1cSJunchao Zhang #endif 3888a65300a6SPaul Mullowney } 3889aa372e3fSPaul Mullowney } 38909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3891aa372e3fSPaul Mullowney 3892e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3893213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3894213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 38959566063dSJacob Faibussowitsch PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */ 3896e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 38979566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 38987656d835SStefano Zampini } 3899213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 39009566063dSJacob Faibussowitsch PetscCall(VecSet_SeqCUDA(zz,0)); 39017656d835SStefano Zampini } 39027656d835SStefano Zampini 3903213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3904213423ffSJunchao Zhang if (compressed) { 39059566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3906a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3907a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3908a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 3909a0e72f99SJunchao Zhang */ 3910a0e72f99SJunchao Zhang #if 0 3911a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3912a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3913a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3914e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3915c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3916a0e72f99SJunchao Zhang #else 3917a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3918a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3919a0e72f99SJunchao Zhang #endif 39209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3921e6e9a74fSStefano Zampini } 3922e6e9a74fSStefano Zampini } else { 
3923e6e9a74fSStefano Zampini if (yy && yy != zz) { 39249566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 3925e6e9a74fSStefano Zampini } 3926e6e9a74fSStefano Zampini } 39279566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray)); 39289566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray)); 39299566063dSJacob Faibussowitsch else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray)); 39309ae82921SPaul Mullowney } catch(char *ex) { 393198921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 39329ae82921SPaul Mullowney } 3933e6e9a74fSStefano Zampini if (yy) { 39349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz)); 3935e6e9a74fSStefano Zampini } else { 39369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt)); 3937e6e9a74fSStefano Zampini } 39389ae82921SPaul Mullowney PetscFunctionReturn(0); 39399ae82921SPaul Mullowney } 39409ae82921SPaul Mullowney 39416fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3942ca45077fSPaul Mullowney { 3943ca45077fSPaul Mullowney PetscFunctionBegin; 39449566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE)); 3945ca45077fSPaul Mullowney PetscFunctionReturn(0); 3946ca45077fSPaul Mullowney } 3947ca45077fSPaul Mullowney 39486fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 39499ae82921SPaul Mullowney { 3950042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3951042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 39523fa6b06aSMark Adams 3953042217e8SBarry Smith PetscFunctionBegin; 39549566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 3955042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3956042217e8SBarry Smith 39579566063dSJacob 
Faibussowitsch PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 39589566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->deviceMat)); 3959042217e8SBarry Smith cusp->deviceMat = NULL; 3960042217e8SBarry Smith } 39619ae82921SPaul Mullowney PetscFunctionReturn(0); 39629ae82921SPaul Mullowney } 39639ae82921SPaul Mullowney 39649ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3965e057df02SPaul Mullowney /*@ 39669ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3967e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 3968e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3969e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 3970e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 3971e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 39729ae82921SPaul Mullowney 3973d083f849SBarry Smith Collective 39749ae82921SPaul Mullowney 39759ae82921SPaul Mullowney Input Parameters: 39769ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 39779ae82921SPaul Mullowney . m - number of rows 39789ae82921SPaul Mullowney . n - number of columns 39799ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 39809ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 39810298fd71SBarry Smith (possibly different for each row) or NULL 39829ae82921SPaul Mullowney 39839ae82921SPaul Mullowney Output Parameter: 39849ae82921SPaul Mullowney . 
A - the matrix 39859ae82921SPaul Mullowney 39869ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 39879ae82921SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 39889ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 39899ae82921SPaul Mullowney 39909ae82921SPaul Mullowney Notes: 39919ae82921SPaul Mullowney If nnz is given then nz is ignored 39929ae82921SPaul Mullowney 39939ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 39949ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 39959ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 39969ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 39979ae82921SPaul Mullowney 39989ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 39990298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 40009ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 40019ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 40029ae82921SPaul Mullowney 40039ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 40049ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 40059ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 40069ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 
40079ae82921SPaul Mullowney 40089ae82921SPaul Mullowney Level: intermediate 40099ae82921SPaul Mullowney 4010db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE` 40119ae82921SPaul Mullowney @*/ 40129ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 40139ae82921SPaul Mullowney { 40149ae82921SPaul Mullowney PetscFunctionBegin; 40159566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 40169566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,m,n)); 40179566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE)); 40189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 40199ae82921SPaul Mullowney PetscFunctionReturn(0); 40209ae82921SPaul Mullowney } 40219ae82921SPaul Mullowney 40226fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 40239ae82921SPaul Mullowney { 40249ae82921SPaul Mullowney PetscFunctionBegin; 40259ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 40269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr)); 40279ae82921SPaul Mullowney } else { 40289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr)); 4029aa372e3fSPaul Mullowney } 40309566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 40319566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 40329566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL)); 40339566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 40349566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 40359566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 40369566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 40379566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 40389566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 40399566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL)); 40409566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 40419ae82921SPaul Mullowney PetscFunctionReturn(0); 40429ae82921SPaul Mullowney } 40439ae82921SPaul Mullowney 4044ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 404595639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 40469ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 40479ff858a8SKarl Rupp { 40489ff858a8SKarl Rupp PetscFunctionBegin; 40499566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B)); 40509566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B)); 40519ff858a8SKarl Rupp PetscFunctionReturn(0); 40529ff858a8SKarl Rupp } 40539ff858a8SKarl Rupp 4054039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 405595639643SRichard Tran Mills { 4056a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = 
(Mat_SeqAIJ*)Y->data; 4057039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 4058039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 4059039c6fbaSStefano Zampini PetscScalar *ay; 4060039c6fbaSStefano Zampini const PetscScalar *ax; 4061039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 4062e6e9a74fSStefano Zampini 406395639643SRichard Tran Mills PetscFunctionBegin; 4064a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 4065a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 4066039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 40679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 40689566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 4069a587d139SMark PetscFunctionReturn(0); 407095639643SRichard Tran Mills } 4071039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 40729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 40739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 40745f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 40755f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 4076039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 4077039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 4078039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 4079039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 4080039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 4081039c6fbaSStefano Zampini if (eq) { 4082039c6fbaSStefano Zampini eq = 
thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 4083039c6fbaSStefano Zampini } 4084039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 4085039c6fbaSStefano Zampini } 4086d2be01edSStefano Zampini /* spgeam is buggy with one column */ 4087d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 4088039c6fbaSStefano Zampini 4089039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 4090039c6fbaSStefano Zampini PetscScalar b = 1.0; 4091039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4092039c6fbaSStefano Zampini size_t bufferSize; 4093039c6fbaSStefano Zampini void *buffer; 4094039c6fbaSStefano Zampini #endif 4095039c6fbaSStefano Zampini 40969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 40979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 40989566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 4099039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 41009566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 4101039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 4102039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 41035f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize)); 41049566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer,bufferSize)); 41059566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 41069566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 4107039c6fbaSStefano Zampini 
&a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 4108039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 41095f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer)); 41109566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 41119566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 41129566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 4113039c6fbaSStefano Zampini #else 41149566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 41159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 4116039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 4117039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 41185f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get())); 41199566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 41209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4121039c6fbaSStefano Zampini #endif 41229566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 41239566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 41249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 41259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 4126039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 4127a587d139SMark cublasHandle_t cublasv2handle; 4128a587d139SMark PetscBLASInt one = 1, bnz = 1; 4129039c6fbaSStefano Zampini 41309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 41319566063dSJacob 
Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 41329566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 41339566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz,&bnz)); 41349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 41359566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one)); 41369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*bnz)); 41379566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 41389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 41399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 41409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 4141039c6fbaSStefano Zampini } else { 41429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 41439566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 4144a587d139SMark } 414595639643SRichard Tran Mills PetscFunctionReturn(0); 414695639643SRichard Tran Mills } 414795639643SRichard Tran Mills 414833c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 414933c9ba73SStefano Zampini { 415033c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 415133c9ba73SStefano Zampini PetscScalar *ay; 415233c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 415333c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 415433c9ba73SStefano Zampini 415533c9ba73SStefano Zampini PetscFunctionBegin; 41569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 41579566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 41589566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz,&bnz)); 41599566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 41609566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one)); 41619566063dSJacob Faibussowitsch 
PetscCall(PetscLogGpuFlops(bnz)); 41629566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 41639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 41649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 416533c9ba73SStefano Zampini PetscFunctionReturn(0); 416633c9ba73SStefano Zampini } 416733c9ba73SStefano Zampini 41683fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 41693fa6b06aSMark Adams { 41707e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 4171a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 41727e8381f9SStefano Zampini 41733fa6b06aSMark Adams PetscFunctionBegin; 41743fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 41753fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 41767e8381f9SStefano Zampini if (spptr->mat) { 41777e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 41787e8381f9SStefano Zampini if (matrix->values) { 41797e8381f9SStefano Zampini both = PETSC_TRUE; 41807e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 41817e8381f9SStefano Zampini } 41827e8381f9SStefano Zampini } 41837e8381f9SStefano Zampini if (spptr->matTranspose) { 41847e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 41857e8381f9SStefano Zampini if (matrix->values) { 41867e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 41877e8381f9SStefano Zampini } 41887e8381f9SStefano Zampini } 41893fa6b06aSMark Adams } 41909566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n])); 41919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 41927e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 4193a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 41943fa6b06aSMark Adams PetscFunctionReturn(0); 41953fa6b06aSMark Adams } 41963fa6b06aSMark Adams 
4197a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 4198a587d139SMark { 4199a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4200a587d139SMark 4201a587d139SMark PetscFunctionBegin; 42029a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 42039a14fc28SStefano Zampini A->boundtocpu = flg; 42049a14fc28SStefano Zampini PetscFunctionReturn(0); 42059a14fc28SStefano Zampini } 4206a587d139SMark if (flg) { 42079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 4208a587d139SMark 420933c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 4210a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 4211a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 4212a587d139SMark A->ops->mult = MatMult_SeqAIJ; 4213a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 4214a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 4215a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 4216a587d139SMark A->ops->multhermitiantranspose = NULL; 4217a587d139SMark A->ops->multhermitiantransposeadd = NULL; 4218fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 42199566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps))); 42209566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 42219566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 42229566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 42239566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 42249566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 42259566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 4226a587d139SMark } else { 422733c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 4228a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 4229a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 4230a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 4231a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 4232a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 4233a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 4234a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 4235a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 4236fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 423767a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 423867a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 423967a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 424067a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 424167a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 424267a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 42437ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 42447ee59b9bSJunchao Zhang 42459566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 42469566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 42479566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 42489566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE)); 42499566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE)); 42509566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 4251a587d139SMark } 4252a587d139SMark A->boundtocpu = flg; 4253ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 4254ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 4255ea500dcfSRichard Tran Mills } else { 4256ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 4257ea500dcfSRichard Tran Mills } 4258a587d139SMark PetscFunctionReturn(0); 4259a587d139SMark } 4260a587d139SMark 426149735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 42629ae82921SPaul Mullowney { 426349735bf3SStefano Zampini Mat B; 42649ae82921SPaul Mullowney 42659ae82921SPaul Mullowney PetscFunctionBegin; 42669566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 426749735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 42689566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); 426949735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 42709566063dSJacob Faibussowitsch PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); 427149735bf3SStefano Zampini } 427249735bf3SStefano Zampini B = *newmat; 427349735bf3SStefano Zampini 42749566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 42759566063dSJacob Faibussowitsch 
PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype)); 427634136279SStefano Zampini 427749735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 42789ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 4279e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 42809566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 42819566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 42829566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 42831a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 4284d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4285ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301 4286a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4287a435da06SStefano Zampini #else 4288d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4289a435da06SStefano Zampini #endif 4290d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4291d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4292d8132acaSStefano Zampini #endif 42931a2c6b5cSJunchao Zhang B->spptr = spptr; 42949ae82921SPaul Mullowney } else { 4295e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 4296e6e9a74fSStefano Zampini 42979566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 42989566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 42999566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream)); 4300e6e9a74fSStefano Zampini B->spptr = spptr; 43019ae82921SPaul Mullowney } 4302e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 430349735bf3SStefano Zampini } 4304693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 43059ae82921SPaul Mullowney B->ops->destroy = 
/* Constructor for MATSEQAIJCUSPARSE: builds a plain SeqAIJ matrix in B, then converts it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
These matrices can be in either 43332692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 43342692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 4335e057df02SPaul Mullowney 4336e057df02SPaul Mullowney Options Database Keys: 4337e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 4338aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 4339a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 4340365b711fSMark Adams + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 4341e057df02SPaul Mullowney 4342e057df02SPaul Mullowney Level: beginner 4343e057df02SPaul Mullowney 4344db781477SPatrick Sanan .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4345e057df02SPaul Mullowney M*/ 43467f756511SDominic Meiser 4347bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*); 43480f39cd5aSBarry Smith 43493ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 435042c9c57cSBarry Smith { 435142c9c57cSBarry Smith PetscFunctionBegin; 43529566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 43539566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 43549566063dSJacob Faibussowitsch 
PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 43559566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 43569566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 4357bddcd29dSMark Adams 435842c9c57cSBarry Smith PetscFunctionReturn(0); 435942c9c57cSBarry Smith } 436029b38603SBarry Smith 4361cbc6b225SStefano Zampini static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 4362cbc6b225SStefano Zampini { 4363cbc6b225SStefano Zampini Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 4364cbc6b225SStefano Zampini 4365cbc6b225SStefano Zampini PetscFunctionBegin; 4366cbc6b225SStefano Zampini if (!cusp) PetscFunctionReturn(0); 4367cbc6b225SStefano Zampini delete cusp->cooPerm; 4368cbc6b225SStefano Zampini delete cusp->cooPerm_a; 4369cbc6b225SStefano Zampini cusp->cooPerm = NULL; 4370cbc6b225SStefano Zampini cusp->cooPerm_a = NULL; 4371cbc6b225SStefano Zampini if (cusp->use_extended_coo) { 43729566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->jmap_d)); 43739566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->perm_d)); 4374cbc6b225SStefano Zampini } 4375cbc6b225SStefano Zampini cusp->use_extended_coo = PETSC_FALSE; 4376cbc6b225SStefano Zampini PetscFunctionReturn(0); 4377cbc6b225SStefano Zampini } 4378cbc6b225SStefano Zampini 4379470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 43807f756511SDominic Meiser { 43817f756511SDominic Meiser PetscFunctionBegin; 43827f756511SDominic Meiser if (*cusparsestruct) { 43839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 43849566063dSJacob Faibussowitsch 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 43857f756511SDominic Meiser delete (*cusparsestruct)->workVector; 438681902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 43877e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 43887e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 4389a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 43909566063dSJacob Faibussowitsch if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 43919566063dSJacob Faibussowitsch if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 43929566063dSJacob Faibussowitsch if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 43939566063dSJacob Faibussowitsch PetscCall(PetscFree(*cusparsestruct)); 43947f756511SDominic Meiser } 43957f756511SDominic Meiser PetscFunctionReturn(0); 43967f756511SDominic Meiser } 43977f756511SDominic Meiser 43987f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 43997f756511SDominic Meiser { 44007f756511SDominic Meiser PetscFunctionBegin; 44017f756511SDominic Meiser if (*mat) { 44027f756511SDominic Meiser delete (*mat)->values; 44037f756511SDominic Meiser delete (*mat)->column_indices; 44047f756511SDominic Meiser delete (*mat)->row_offsets; 44057f756511SDominic Meiser delete *mat; 44067f756511SDominic Meiser *mat = 0; 44077f756511SDominic Meiser } 44087f756511SDominic Meiser PetscFunctionReturn(0); 44097f756511SDominic Meiser } 44107f756511SDominic Meiser 4411470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 44127f756511SDominic Meiser { 44137f756511SDominic Meiser PetscFunctionBegin; 44147f756511SDominic Meiser if (*trifactor) { 44159566063dSJacob Faibussowitsch if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 
4416261a78b4SJunchao Zhang if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 44179566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 44189566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 44199566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4420afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 44219566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4422afb2bd1cSJunchao Zhang #endif 44239566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactor)); 44247f756511SDominic Meiser } 44257f756511SDominic Meiser PetscFunctionReturn(0); 44267f756511SDominic Meiser } 44277f756511SDominic Meiser 4428470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 44297f756511SDominic Meiser { 44307f756511SDominic Meiser CsrMatrix *mat; 44317f756511SDominic Meiser 44327f756511SDominic Meiser PetscFunctionBegin; 44337f756511SDominic Meiser if (*matstruct) { 44347f756511SDominic Meiser if ((*matstruct)->mat) { 44357f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 4436afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4437afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4438afb2bd1cSJunchao Zhang #else 44397f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 44409566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4441afb2bd1cSJunchao Zhang #endif 44427f756511SDominic Meiser } else { 44437f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 44447f756511SDominic Meiser CsrMatrix_Destroy(&mat); 44457f756511SDominic Meiser } 
44467f756511SDominic Meiser } 44479566063dSJacob Faibussowitsch if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 44487f756511SDominic Meiser delete (*matstruct)->cprowIndices; 44499566063dSJacob Faibussowitsch if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 44509566063dSJacob Faibussowitsch if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 44519566063dSJacob Faibussowitsch if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4452afb2bd1cSJunchao Zhang 4453afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4454afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 44559566063dSJacob Faibussowitsch if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4456afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 4457afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 44589566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 44599566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 44609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4461afb2bd1cSJunchao Zhang } 4462afb2bd1cSJunchao Zhang } 4463afb2bd1cSJunchao Zhang #endif 44647f756511SDominic Meiser delete *matstruct; 44657e8381f9SStefano Zampini *matstruct = NULL; 44667f756511SDominic Meiser } 44677f756511SDominic Meiser PetscFunctionReturn(0); 44687f756511SDominic Meiser } 44697f756511SDominic Meiser 4470e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 44717f756511SDominic Meiser { 4472da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4473da112707SJunchao Zhang 44747f756511SDominic Meiser PetscFunctionBegin; 4475da112707SJunchao Zhang if (fs) { 4476da112707SJunchao Zhang 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4477da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4478da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4479da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4480da112707SJunchao Zhang delete fs->rpermIndices; 4481da112707SJunchao Zhang delete fs->cpermIndices; 4482da112707SJunchao Zhang delete fs->workVector; 4483da112707SJunchao Zhang fs->rpermIndices = NULL; 4484da112707SJunchao Zhang fs->cpermIndices = NULL; 4485da112707SJunchao Zhang fs->workVector = NULL; 4486da112707SJunchao Zhang if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d)); 4487da112707SJunchao Zhang if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d)); 4488da112707SJunchao Zhang fs->init_dev_prop = PETSC_FALSE; 4489da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 4490da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4491da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx)); 4492da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrVal)); 4493da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->X)); 4494da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->Y)); 4495*12ba2bc6SJunchao Zhang // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 4496da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4497da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 4498*12ba2bc6SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4499da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4500da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4501da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4502da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4503da112707SJunchao Zhang 
PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4504da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4505da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4506da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4507da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4508da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4509da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4510da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4511*12ba2bc6SJunchao Zhang 4512*12ba2bc6SJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_FALSE; 4513*12ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4514da112707SJunchao Zhang #endif 4515ccdfe979SStefano Zampini } 4516ccdfe979SStefano Zampini PetscFunctionReturn(0); 4517ccdfe979SStefano Zampini } 4518ccdfe979SStefano Zampini 4519ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 4520ccdfe979SStefano Zampini { 4521ccdfe979SStefano Zampini cusparseHandle_t handle; 4522ccdfe979SStefano Zampini 4523ccdfe979SStefano Zampini PetscFunctionBegin; 4524ccdfe979SStefano Zampini if (*trifactors) { 45259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 45267f756511SDominic Meiser if (handle = (*trifactors)->handle) { 45279566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroy(handle)); 45287f756511SDominic Meiser } 45299566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 45307f756511SDominic Meiser } 45317f756511SDominic Meiser PetscFunctionReturn(0); 45327f756511SDominic Meiser } 45337e8381f9SStefano Zampini 45347e8381f9SStefano Zampini struct IJCompare 45357e8381f9SStefano Zampini { 45367e8381f9SStefano Zampini __host__ __device__ 
45377e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 45387e8381f9SStefano Zampini { 45397e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 45407e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 45417e8381f9SStefano Zampini return false; 45427e8381f9SStefano Zampini } 45437e8381f9SStefano Zampini }; 45447e8381f9SStefano Zampini 45457e8381f9SStefano Zampini struct IJEqual 45467e8381f9SStefano Zampini { 45477e8381f9SStefano Zampini __host__ __device__ 45487e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 45497e8381f9SStefano Zampini { 45507e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 45517e8381f9SStefano Zampini return true; 45527e8381f9SStefano Zampini } 45537e8381f9SStefano Zampini }; 45547e8381f9SStefano Zampini 45557e8381f9SStefano Zampini struct IJDiff 45567e8381f9SStefano Zampini { 45577e8381f9SStefano Zampini __host__ __device__ 45587e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 45597e8381f9SStefano Zampini { 45607e8381f9SStefano Zampini return t1 == t2 ? 
/* Logical OR of two flags, returned as a PetscInt (0 or 1) */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return t1||t2;
  }
};

#include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic - sets or adds the COO values v[] into the GPU CSR values array.
  Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(), which builds cooPerm/cooPerm_a.

  Input Parameters:
    A     - the matrix
    v     - the COO values, in host or device memory; may be NULL
    imode - ADD_VALUES adds to the stored values, anything else overwrites them

  Notes:
    - Without a cooPerm array the routine only runs a final assembly and returns.
    - With v == NULL and INSERT_VALUES the stored values are set to zero; with v == NULL and any
      other mode the values are left untouched.
    - cooPerm_a non-NULL means the COO input has repeated (i,j) entries; these are summed with
      reduce_by_key before being stored or added.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* temporary copy of v[], used only when v[] is not CUDA memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO permutation available: nothing to insert, just assemble */
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* v[] is not CUDA memory: stage it in a temporary array (deleted at finalize) */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      /* values[] += the per-entry sums just computed */
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v; /* NULL (a no-op) unless v[] had to be staged above */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
46357e8381f9SStefano Zampini matrix->values->end())); 46367e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 46377e8381f9SStefano Zampini } 46387e8381f9SStefano Zampini } 46399566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4640e61fc153SStefano Zampini finalize: 4641e61fc153SStefano Zampini delete cooPerm_v; 46427e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 46439566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4644fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 46459566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 46469566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n")); 46479566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax)); 4648fcdce8c4SStefano Zampini a->reallocs = 0; 4649fcdce8c4SStefano Zampini A->info.mallocs += 0; 4650fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 4651fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 4652fcdce8c4SStefano Zampini A->num_ass++; 46537e8381f9SStefano Zampini PetscFunctionReturn(0); 46547e8381f9SStefano Zampini } 46557e8381f9SStefano Zampini 4656a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4657a49f1ed0SStefano Zampini { 4658a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4659a49f1ed0SStefano Zampini 4660a49f1ed0SStefano Zampini PetscFunctionBegin; 4661a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4662a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 4663a49f1ed0SStefano Zampini if (destroy) { 46649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format)); 4665a49f1ed0SStefano 
#include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic - builds the CSR pattern of A from n COO index pairs
  and records the permutation used later by MatSetValuesCOO_SeqAIJCUSPARSE_Basic().

  'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices.

  Input Parameters:
    A     - the matrix
    n     - number of COO entries
    coo_i - row indices
    coo_j - column indices

  On return (n > 0):
    cusp->cooPerm   - permutation that sorts the COO entries by row, then by column
    cusp->cooPerm_a - for each sorted entry, the index of the unique nonzero it contributes to;
                      NULL when all (i,j) pairs are unique
    a->i, a->j, a->a - host CSR arrays of the unique pattern, with a->a zeroed
  The zeroed matrix is then copied to the GPU and any cached transpose is destroyed.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  /* permutation arrays of a previous call are reused only when the entry count is unchanged */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
      d_j = [2,2,3,5,6,x]
      nekey points at the slot marked x (the new logical end); ekey is still one past the last slot
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays by ones matching the unique COO pattern */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    /* per-row lengths, number of nonempty rows (nzr) and the longest row (rmax) */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
47737e8381f9SStefano Zampini } 47749566063dSJacob Faibussowitsch PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 47757e8381f9SStefano Zampini 47767e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 4777e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 47789566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->nz)); 47799566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6)); 47807e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 47819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 47829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 47837e8381f9SStefano Zampini PetscFunctionReturn(0); 47847e8381f9SStefano Zampini } 4785ed502f03SStefano Zampini 4786219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 4787219fbbafSJunchao Zhang { 4788219fbbafSJunchao Zhang Mat_SeqAIJ *seq; 4789219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev; 4790cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 4791219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4792219fbbafSJunchao Zhang 4793219fbbafSJunchao Zhang PetscFunctionBegin; 47949566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 47959566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4796219fbbafSJunchao Zhang if (coo_i) { 47979566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i,&mtype)); 4798219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 4799219fbbafSJunchao Zhang for (PetscCount k=0; k<coo_n; k++) { 4800cbc6b225SStefano Zampini if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4801219fbbafSJunchao Zhang } 4802219fbbafSJunchao Zhang } 4803219fbbafSJunchao Zhang } 4804219fbbafSJunchao Zhang 
4805219fbbafSJunchao Zhang if (coo_basic) { /* i,j are on device or do not contain negative indices */ 48069566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 4807219fbbafSJunchao Zhang } else { 48089566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 4809cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 48109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4811219fbbafSJunchao Zhang seq = static_cast<Mat_SeqAIJ*>(mat->data); 4812219fbbafSJunchao Zhang dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 48139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount))); 48149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 48159566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount))); 48169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4817219fbbafSJunchao Zhang dev->use_extended_coo = PETSC_TRUE; 4818219fbbafSJunchao Zhang } 4819219fbbafSJunchao Zhang PetscFunctionReturn(0); 4820219fbbafSJunchao Zhang } 4821219fbbafSJunchao Zhang 482277804d84SJunchao Zhang __global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4823219fbbafSJunchao Zhang { 4824219fbbafSJunchao Zhang PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4825219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4826b6c38306SJunchao Zhang for (; i<nnz; i+= grid_size) { 4827b6c38306SJunchao Zhang PetscScalar sum = 0.0; 4828b6c38306SJunchao Zhang for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4829b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES? 
0.0 : a[i]) + sum; 4830b6c38306SJunchao Zhang } 4831219fbbafSJunchao Zhang } 4832219fbbafSJunchao Zhang 4833219fbbafSJunchao Zhang PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4834219fbbafSJunchao Zhang { 4835219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4836219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4837219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4838219fbbafSJunchao Zhang PetscMemType memtype; 4839219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4840219fbbafSJunchao Zhang PetscScalar *Aa; 4841219fbbafSJunchao Zhang 4842219fbbafSJunchao Zhang PetscFunctionBegin; 4843219fbbafSJunchao Zhang if (dev->use_extended_coo) { 48449566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v,&memtype)); 4845219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 48469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 48479566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4848219fbbafSJunchao Zhang } 4849219fbbafSJunchao Zhang 48509566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 48519566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4852219fbbafSJunchao Zhang 4853cbc6b225SStefano Zampini if (Annz) { 4854b6c38306SJunchao Zhang MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 48559566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 4856cbc6b225SStefano Zampini } 4857219fbbafSJunchao Zhang 48589566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 48599566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4860219fbbafSJunchao Zhang 48619566063dSJacob 
Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); 4862219fbbafSJunchao Zhang } else { 48639566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4864219fbbafSJunchao Zhang } 4865219fbbafSJunchao Zhang PetscFunctionReturn(0); 4866219fbbafSJunchao Zhang } 4867219fbbafSJunchao Zhang 48685b7e41feSStefano Zampini /*@C 48695b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 48705b7e41feSStefano Zampini 48715b7e41feSStefano Zampini Not collective 48725b7e41feSStefano Zampini 48735b7e41feSStefano Zampini Input Parameters: 48745b7e41feSStefano Zampini + A - the matrix 48755b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 48765b7e41feSStefano Zampini 48775b7e41feSStefano Zampini Output Parameters: 48785b7e41feSStefano Zampini + ia - the CSR row pointers 48795b7e41feSStefano Zampini - ja - the CSR column indices 48805b7e41feSStefano Zampini 48815b7e41feSStefano Zampini Level: developer 48825b7e41feSStefano Zampini 48835b7e41feSStefano Zampini Notes: 48845b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 48855b7e41feSStefano Zampini 4886db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 48875b7e41feSStefano Zampini @*/ 48885f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 48895f101d05SStefano Zampini { 48905f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 48915f101d05SStefano Zampini CsrMatrix *csr; 48925f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 48935f101d05SStefano Zampini 48945f101d05SStefano Zampini PetscFunctionBegin; 48955f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 48965f101d05SStefano Zampini if 
(!i || !j) PetscFunctionReturn(0); 48975f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4898aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 48999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 490028b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 49015f101d05SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 49025f101d05SStefano Zampini if (i) { 49035f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 49045f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 49055f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 49065f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 49079566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 49085f101d05SStefano Zampini } 49095f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 49105f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 49115f101d05SStefano Zampini } 49125f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 49135f101d05SStefano Zampini PetscFunctionReturn(0); 49145f101d05SStefano Zampini } 49155f101d05SStefano Zampini 49165b7e41feSStefano Zampini /*@C 49175b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 49185b7e41feSStefano Zampini 49195b7e41feSStefano Zampini Not collective 49205b7e41feSStefano Zampini 49215b7e41feSStefano Zampini Input Parameters: 49225b7e41feSStefano Zampini + A - the matrix 49235b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 49245b7e41feSStefano Zampini 49255b7e41feSStefano Zampini Output Parameters: 
49265b7e41feSStefano Zampini + ia - the CSR row pointers 49275b7e41feSStefano Zampini - ja - the CSR column indices 49285b7e41feSStefano Zampini 49295b7e41feSStefano Zampini Level: developer 49305b7e41feSStefano Zampini 4931db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()` 49325b7e41feSStefano Zampini @*/ 49335f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 49345f101d05SStefano Zampini { 49355f101d05SStefano Zampini PetscFunctionBegin; 49365f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 49375f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 49385f101d05SStefano Zampini if (i) *i = NULL; 49395f101d05SStefano Zampini if (j) *j = NULL; 49405f101d05SStefano Zampini PetscFunctionReturn(0); 49415f101d05SStefano Zampini } 49425f101d05SStefano Zampini 49435b7e41feSStefano Zampini /*@C 49445b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 49455b7e41feSStefano Zampini 49465b7e41feSStefano Zampini Not Collective 49475b7e41feSStefano Zampini 49485b7e41feSStefano Zampini Input Parameter: 49495b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 49505b7e41feSStefano Zampini 49515b7e41feSStefano Zampini Output Parameter: 49525b7e41feSStefano Zampini . 
a - pointer to the device data 49535b7e41feSStefano Zampini 49545b7e41feSStefano Zampini Level: developer 49555b7e41feSStefano Zampini 49565b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 49575b7e41feSStefano Zampini 4958db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 49595b7e41feSStefano Zampini @*/ 4960ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4961ed502f03SStefano Zampini { 4962ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4963ed502f03SStefano Zampini CsrMatrix *csr; 4964ed502f03SStefano Zampini 4965ed502f03SStefano Zampini PetscFunctionBegin; 4966ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4967ed502f03SStefano Zampini PetscValidPointer(a,2); 4968ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4969aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 49709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 497128b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4972ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 497328b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4974ed502f03SStefano Zampini *a = csr->values->data().get(); 4975ed502f03SStefano Zampini PetscFunctionReturn(0); 4976ed502f03SStefano Zampini } 4977ed502f03SStefano Zampini 49785b7e41feSStefano Zampini /*@C 49795b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 49805b7e41feSStefano Zampini 49815b7e41feSStefano Zampini Not Collective 49825b7e41feSStefano Zampini 49835b7e41feSStefano Zampini 
Input Parameter: 49845b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 49855b7e41feSStefano Zampini 49865b7e41feSStefano Zampini Output Parameter: 49875b7e41feSStefano Zampini . a - pointer to the device data 49885b7e41feSStefano Zampini 49895b7e41feSStefano Zampini Level: developer 49905b7e41feSStefano Zampini 4991db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 49925b7e41feSStefano Zampini @*/ 4993ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4994ed502f03SStefano Zampini { 4995ed502f03SStefano Zampini PetscFunctionBegin; 4996ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4997ed502f03SStefano Zampini PetscValidPointer(a,2); 4998ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4999ed502f03SStefano Zampini *a = NULL; 5000ed502f03SStefano Zampini PetscFunctionReturn(0); 5001ed502f03SStefano Zampini } 5002ed502f03SStefano Zampini 50035b7e41feSStefano Zampini /*@C 50045b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 50055b7e41feSStefano Zampini 50065b7e41feSStefano Zampini Not Collective 50075b7e41feSStefano Zampini 50085b7e41feSStefano Zampini Input Parameter: 50095b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 50105b7e41feSStefano Zampini 50115b7e41feSStefano Zampini Output Parameter: 50125b7e41feSStefano Zampini . 
a - pointer to the device data 50135b7e41feSStefano Zampini 50145b7e41feSStefano Zampini Level: developer 50155b7e41feSStefano Zampini 50165b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 50175b7e41feSStefano Zampini 5018db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 50195b7e41feSStefano Zampini @*/ 5020039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 5021039c6fbaSStefano Zampini { 5022039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 5023039c6fbaSStefano Zampini CsrMatrix *csr; 5024039c6fbaSStefano Zampini 5025039c6fbaSStefano Zampini PetscFunctionBegin; 5026039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5027039c6fbaSStefano Zampini PetscValidPointer(a,2); 5028039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5029aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 50309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 503128b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5032039c6fbaSStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 503328b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5034039c6fbaSStefano Zampini *a = csr->values->data().get(); 5035039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 50369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5037039c6fbaSStefano Zampini PetscFunctionReturn(0); 5038039c6fbaSStefano Zampini } 50395b7e41feSStefano Zampini /*@C 50405b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 
5041039c6fbaSStefano Zampini 50425b7e41feSStefano Zampini Not Collective 50435b7e41feSStefano Zampini 50445b7e41feSStefano Zampini Input Parameter: 50455b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 50465b7e41feSStefano Zampini 50475b7e41feSStefano Zampini Output Parameter: 50485b7e41feSStefano Zampini . a - pointer to the device data 50495b7e41feSStefano Zampini 50505b7e41feSStefano Zampini Level: developer 50515b7e41feSStefano Zampini 5052db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()` 50535b7e41feSStefano Zampini @*/ 5054039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 5055039c6fbaSStefano Zampini { 5056039c6fbaSStefano Zampini PetscFunctionBegin; 5057039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5058039c6fbaSStefano Zampini PetscValidPointer(a,2); 5059039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 50609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 50619566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 5062039c6fbaSStefano Zampini *a = NULL; 5063039c6fbaSStefano Zampini PetscFunctionReturn(0); 5064039c6fbaSStefano Zampini } 5065039c6fbaSStefano Zampini 50665b7e41feSStefano Zampini /*@C 50675b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 50685b7e41feSStefano Zampini 50695b7e41feSStefano Zampini Not Collective 50705b7e41feSStefano Zampini 50715b7e41feSStefano Zampini Input Parameter: 50725b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 50735b7e41feSStefano Zampini 50745b7e41feSStefano Zampini Output Parameter: 50755b7e41feSStefano Zampini . 
a - pointer to the device data 50765b7e41feSStefano Zampini 50775b7e41feSStefano Zampini Level: developer 50785b7e41feSStefano Zampini 50795b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 50805b7e41feSStefano Zampini 5081db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 50825b7e41feSStefano Zampini @*/ 5083ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 5084ed502f03SStefano Zampini { 5085ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 5086ed502f03SStefano Zampini CsrMatrix *csr; 5087ed502f03SStefano Zampini 5088ed502f03SStefano Zampini PetscFunctionBegin; 5089ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5090ed502f03SStefano Zampini PetscValidPointer(a,2); 5091ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5092aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 509328b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5094ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 509528b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5096ed502f03SStefano Zampini *a = csr->values->data().get(); 5097039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 50989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5099ed502f03SStefano Zampini PetscFunctionReturn(0); 5100ed502f03SStefano Zampini } 5101ed502f03SStefano Zampini 51025b7e41feSStefano Zampini /*@C 51035b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 51045b7e41feSStefano Zampini 
51055b7e41feSStefano Zampini Not Collective 51065b7e41feSStefano Zampini 51075b7e41feSStefano Zampini Input Parameter: 51085b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 51095b7e41feSStefano Zampini 51105b7e41feSStefano Zampini Output Parameter: 51115b7e41feSStefano Zampini . a - pointer to the device data 51125b7e41feSStefano Zampini 51135b7e41feSStefano Zampini Level: developer 51145b7e41feSStefano Zampini 5115db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 51165b7e41feSStefano Zampini @*/ 5117ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 5118ed502f03SStefano Zampini { 5119ed502f03SStefano Zampini PetscFunctionBegin; 5120ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5121ed502f03SStefano Zampini PetscValidPointer(a,2); 5122ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 51239566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 51249566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 5125ed502f03SStefano Zampini *a = NULL; 5126ed502f03SStefano Zampini PetscFunctionReturn(0); 5127ed502f03SStefano Zampini } 5128ed502f03SStefano Zampini 5129ed502f03SStefano Zampini struct IJCompare4 5130ed502f03SStefano Zampini { 5131ed502f03SStefano Zampini __host__ __device__ 51322ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 5133ed502f03SStefano Zampini { 5134ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 5135ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 5136ed502f03SStefano Zampini return false; 5137ed502f03SStefano Zampini } 5138ed502f03SStefano Zampini }; 5139ed502f03SStefano Zampini 51408909a122SStefano Zampini struct Shift 51418909a122SStefano Zampini { 5142ed502f03SStefano Zampini int _shift; 5143ed502f03SStefano Zampini 
5144ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 5145ed502f03SStefano Zampini __host__ __device__ 5146ed502f03SStefano Zampini inline int operator() (const int &c) 5147ed502f03SStefano Zampini { 5148ed502f03SStefano Zampini return c + _shift; 5149ed502f03SStefano Zampini } 5150ed502f03SStefano Zampini }; 5151ed502f03SStefano Zampini 5152ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 5153ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 5154ed502f03SStefano Zampini { 5155ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 5156ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 5157ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 5158ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 5159ed502f03SStefano Zampini PetscInt Annz,Bnnz; 5160ed502f03SStefano Zampini cusparseStatus_t stat; 5161ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 5162ed502f03SStefano Zampini 5163ed502f03SStefano Zampini PetscFunctionBegin; 5164ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5165ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 5166ed502f03SStefano Zampini PetscValidPointer(C,4); 5167ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5168ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 51695f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 517008401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 5171aed4548fSBarry Smith PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != 
MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5172aed4548fSBarry Smith PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5173ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 5174ed502f03SStefano Zampini m = A->rmap->n; 5175ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 51769566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,C)); 51779566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C,m,n,m,n)); 51789566063dSJacob Faibussowitsch PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE)); 5179ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 5180ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 5181ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 5182ed502f03SStefano Zampini Ccsr = new CsrMatrix; 5183ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 5184ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 5185ed502f03SStefano Zampini c->compressedrow.nrows = 0; 5186ed502f03SStefano Zampini c->compressedrow.i = NULL; 5187ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 5188ed502f03SStefano Zampini Ccusp->workVector = NULL; 5189ed502f03SStefano Zampini Ccusp->nrows = m; 5190ed502f03SStefano Zampini Ccusp->mat = Cmat; 5191ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 5192ed502f03SStefano Zampini Ccsr->num_rows = m; 5193ed502f03SStefano Zampini Ccsr->num_cols = n; 51949566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 51959566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 51969566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 51979566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 51989566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void 
**)&(Cmat->beta_zero),sizeof(PetscScalar))); 51999566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 52009566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 52019566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 52029566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 52039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 52049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 520528b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 520628b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5207ed502f03SStefano Zampini 5208ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 5209ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 5210ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 5211ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 5212ed502f03SStefano Zampini c->nz = Annz + Bnnz; 5213ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 5214ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 5215ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 5216ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 5217ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 5218ed502f03SStefano Zampini if (c->nz) { 52192ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 52202ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 52212ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 52222ed87e7eSStefano Zampini THRUSTINTARRAY32 
*Aroff,*Broff; 52232ed87e7eSStefano Zampini 5224ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 5225ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 5226ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 5227ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 52289566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 5229ed502f03SStefano Zampini } 52302ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 52312ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 5232ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 5233ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 5234ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 5235ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 52369566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 5237ed502f03SStefano Zampini } 52382ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 52392ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 52409566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 52412ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 52422ed87e7eSStefano Zampini Aroff->data().get(), 52432ed87e7eSStefano Zampini Annz, 52442ed87e7eSStefano Zampini m, 52452ed87e7eSStefano Zampini Acoo->data().get(), 52469566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 5247ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 52482ed87e7eSStefano Zampini Broff->data().get(), 5249ed502f03SStefano Zampini Bnnz, 5250ed502f03SStefano Zampini m, 52512ed87e7eSStefano Zampini Bcoo->data().get(), 52529566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 52532ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 
52542ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 52552ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 52568909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 5257ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 5258ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 52598909a122SStefano Zampini #else 52608909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 52618909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 52628909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 52638909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 52648909a122SStefano Zampini #endif 52652ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 52662ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 52672ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 52682ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 52692ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 52702ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 5271ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 5272ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 5273ed502f03SStefano Zampini thrust::advance(p2,Annz); 52742ed87e7eSStefano Zampini 
PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 52758909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 52768909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 52778909a122SStefano Zampini #endif 52782ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 52792ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 52802ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 52812ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 52822ed87e7eSStefano Zampini #else 52832ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 52842ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 52852ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 52862ed87e7eSStefano Zampini #endif 5287ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 52882ed87e7eSStefano Zampini Ccoo->data().get(), 5289ed502f03SStefano Zampini c->nz, 5290ed502f03SStefano Zampini m, 5291ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 52929566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 52939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 52942ed87e7eSStefano Zampini delete wPerm; 52952ed87e7eSStefano Zampini delete Acoo; 52962ed87e7eSStefano Zampini delete Bcoo; 52972ed87e7eSStefano Zampini delete Ccoo; 5298ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 5299ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 5300ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 5301ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 53029566063dSJacob Faibussowitsch 
CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 5303ed502f03SStefano Zampini #endif 53041a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 53059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 53069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 5307ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5308ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 5309ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 5310ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 5311ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 5312ed502f03SStefano Zampini 53131a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 53141a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 5315a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 5316ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 5317ed502f03SStefano Zampini CmatT->mat = CcsrT; 5318ed502f03SStefano Zampini CcsrT->num_rows = n; 5319ed502f03SStefano Zampini CcsrT->num_cols = m; 5320ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 5321ed502f03SStefano Zampini 5322ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 5323ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 5324ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 5325ed502f03SStefano Zampini 53269566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 5327ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 5328ed502f03SStefano Zampini if (AT) { 5329ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 
5330ed502f03SStefano Zampini thrust::advance(rT,-1); 5331ed502f03SStefano Zampini } 5332ed502f03SStefano Zampini if (BT) { 5333ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 5334ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 5335ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 5336ed502f03SStefano Zampini } 5337ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 5338ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 5339ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 5340ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 5341ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 5342ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 53439566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5344ed502f03SStefano Zampini 53459566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 53469566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 53479566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 53489566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 53499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 53509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 53519566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 53529566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 53539566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5354ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 5355ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 5356ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 5357ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 53589566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 5359ed502f03SStefano Zampini #endif 5360ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 5361ed502f03SStefano Zampini } 5362ed502f03SStefano Zampini } 5363ed502f03SStefano Zampini 5364ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 5365ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 5366ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 53679566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m+1,&c->i)); 53689566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->j)); 5369ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 5370ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 5371ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 5372ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 5373ed502f03SStefano Zampini jj = *Ccsr->column_indices; 53749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 53759566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5376ed502f03SStefano Zampini } else { 
53779566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 53789566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5379ed502f03SStefano Zampini } 53809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 53819566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->ilen)); 53829566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->imax)); 5383ed502f03SStefano Zampini c->maxnz = c->nz; 5384ed502f03SStefano Zampini c->nonzerorowcnt = 0; 5385ed502f03SStefano Zampini c->rmax = 0; 5386ed502f03SStefano Zampini for (i = 0; i < m; i++) { 5387ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 5388ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 5389ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 5390ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 5391ed502f03SStefano Zampini } 53929566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 53939566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->a)); 5394ed502f03SStefano Zampini (*C)->nonzerostate++; 53959566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 53969566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 5397ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 5398ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 5399ed502f03SStefano Zampini } else { 540008401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 5401ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 5402ed502f03SStefano Zampini if (c->nz) { 5403ed502f03SStefano Zampini Ccusp = 
(Mat_SeqAIJCUSPARSE*)(*C)->spptr; 54045f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 5405aed4548fSBarry Smith PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 540608401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 54079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 54089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 54095f80ce2aSJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 54105f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5411ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 5412ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 5413ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 5414aed4548fSBarry Smith PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 5415aed4548fSBarry Smith PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 5416aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 5417aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 54185f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm->size() == 
Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 5419ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 5420ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 54219566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 5422ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 5423ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 5424ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 5425ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 5426ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 5427ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 5428ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 5429ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 5430ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 5431ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 54329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 54331a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 54345f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 5435ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5436ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? 
(CsrMatrix*)Acusp->matTranspose->mat : NULL; 5437ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 5438ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 5439ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 5440ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 5441ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 54421a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 5443ed502f03SStefano Zampini } 54449566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5445ed502f03SStefano Zampini } 5446ed502f03SStefano Zampini } 54479566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 5448ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 5449ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 5450ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 5451ed502f03SStefano Zampini PetscFunctionReturn(0); 5452ed502f03SStefano Zampini }

/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the matrix value array into v.

  Input Parameters:
+ A   - the matrix; its values are read through MatSeqAIJCUSPARSEGetArrayRead() and are
        treated as device memory by every copy below
. n   - number of entries to copy
- idx - positions in the value array of the entries to gather; when NULL (or when n is 0)
        the first n entries are copied contiguously

  Output Parameter:
. v - destination for the n values; isCudaMem() decides whether it is handled as device
      memory (written in place) or host memory (filled through a device-to-host copy)
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the gather indices so the permutation iterator can run on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    /* Staging buffer, only sized when v lives on the host. It is a scoped object (not a raw
       new/delete pair) so it is released even if a checked call below returns early. */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) dv = thrust::device_pointer_cast(v);
    else      dv = w.data();

    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[idx[k]] with dv[k]; VecCUDAEquals is applied to each pair
       (presumably assigning the second element from the first -- defined elsewhere) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      PetscCallCUDA(cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travelled device -> host, so account for them as GPU-to-CPU traffic */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}