19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 18a2cee5feSJed Brown #include <thrust/remove.h> 19a2cee5feSJed Brown #include <thrust/sort.h> 20a2cee5feSJed Brown #include <thrust/unique.h> 21e8d2b73aSMark Adams 22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 24afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
26afb2bd1cSJunchao Zhang 27afb2bd1cSJunchao Zhang typedef enum { 28afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 29afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 30afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 31afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 32afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 33afb2bd1cSJunchao Zhang 34afb2bd1cSJunchao Zhang typedef enum { 35afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 42afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 43afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 47afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 48afb2bd1cSJunchao Zhang 49afb2bd1cSJunchao Zhang typedef enum { 50afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 53afb2bd1cSJunchao Zhang */ 54afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 55afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
56afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 57afb2bd1cSJunchao Zhang #endif 589ae82921SPaul Mullowney 59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 62087f3262SPaul Mullowney 636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar); 746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 776fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 819ae82921SPaul Mullowney 827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 877f756511SDominic Meiser 8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool); 9057181aedSStefano Zampini 91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]); 92219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]); 93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 94c215019aSStefano Zampini 95bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 96ca45077fSPaul Mullowney { 97aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 986e111a19SKarl Rupp 99ca45077fSPaul Mullowney PetscFunctionBegin; 100ca45077fSPaul Mullowney switch 
(op) { 101e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 102aa372e3fSPaul Mullowney cusparsestruct->format = format; 103ca45077fSPaul Mullowney break; 104e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 105aa372e3fSPaul Mullowney cusparsestruct->format = format; 106ca45077fSPaul Mullowney break; 107ca45077fSPaul Mullowney default: 10898921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 109ca45077fSPaul Mullowney } 110ca45077fSPaul Mullowney PetscFunctionReturn(0); 111ca45077fSPaul Mullowney } 1129ae82921SPaul Mullowney 113e057df02SPaul Mullowney /*@ 114e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 115e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 116aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 117e057df02SPaul Mullowney Not Collective 118e057df02SPaul Mullowney 119e057df02SPaul Mullowney Input Parameters: 1208468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 12136d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1222692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. 
The latter two require CUDA 4.2) 123e057df02SPaul Mullowney 124e057df02SPaul Mullowney Output Parameter: 125e057df02SPaul Mullowney 126e057df02SPaul Mullowney Level: intermediate 127e057df02SPaul Mullowney 128db781477SPatrick Sanan .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 129e057df02SPaul Mullowney @*/ 130e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 131e057df02SPaul Mullowney { 132e057df02SPaul Mullowney PetscFunctionBegin; 133e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 134cac4c232SBarry Smith PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)); 135e057df02SPaul Mullowney PetscFunctionReturn(0); 136e057df02SPaul Mullowney } 137e057df02SPaul Mullowney 138365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu) 139365b711fSMark Adams { 140365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 141365b711fSMark Adams 142365b711fSMark Adams PetscFunctionBegin; 143365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 144365b711fSMark Adams PetscFunctionReturn(0); 145365b711fSMark Adams } 146365b711fSMark Adams 147365b711fSMark Adams /*@ 148365b711fSMark Adams MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 149365b711fSMark Adams 150365b711fSMark Adams Input Parameters: 151365b711fSMark Adams + A - Matrix of type SEQAIJCUSPARSE 152365b711fSMark Adams - use_cpu - set flag for using the built-in CPU MatSolve 153365b711fSMark Adams 154365b711fSMark Adams Output Parameter: 155365b711fSMark Adams 156365b711fSMark Adams Notes: 157365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 158365b711fSMark Adams and moves the factors to the GPU for the solve. 
We have observed better performance keeping the data on the CPU and computing the solve there. 159365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 160365b711fSMark Adams 161365b711fSMark Adams Level: intermediate 162365b711fSMark Adams 163db781477SPatrick Sanan .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 164365b711fSMark Adams @*/ 165365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu) 166365b711fSMark Adams { 167365b711fSMark Adams PetscFunctionBegin; 168365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID,1); 169cac4c232SBarry Smith PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)); 170365b711fSMark Adams PetscFunctionReturn(0); 171365b711fSMark Adams } 172365b711fSMark Adams 1731a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg) 174e6e9a74fSStefano Zampini { 175e6e9a74fSStefano Zampini PetscFunctionBegin; 1761a2c6b5cSJunchao Zhang switch (op) { 1771a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 1781a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 1799566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1801a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 1811a2c6b5cSJunchao Zhang break; 1821a2c6b5cSJunchao Zhang default: 1839566063dSJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 1841a2c6b5cSJunchao Zhang break; 185e6e9a74fSStefano Zampini } 186e6e9a74fSStefano Zampini PetscFunctionReturn(0); 187e6e9a74fSStefano Zampini } 188e6e9a74fSStefano Zampini 189bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 190bddcd29dSMark Adams 191bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 
192bddcd29dSMark Adams { 193bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 194bddcd29dSMark Adams IS isrow = b->row,iscol = b->col; 195bddcd29dSMark Adams PetscBool row_identity,col_identity; 196365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr; 197bddcd29dSMark Adams 198bddcd29dSMark Adams PetscFunctionBegin; 1999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2009566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 201bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 202bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 2039566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 2049566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol,&col_identity)); 205f93f8571SJunchao Zhang 206365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 207f93f8571SJunchao Zhang if (row_identity && col_identity) { 208bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 209bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 210bddcd29dSMark Adams } else { 211bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 212bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 213365b711fSMark Adams } 214f93f8571SJunchao Zhang } 215bddcd29dSMark Adams B->ops->matsolve = NULL; 216bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 217bddcd29dSMark Adams 218bddcd29dSMark Adams /* get the triangular factors */ 219365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 2209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 221365b711fSMark Adams } 222bddcd29dSMark Adams PetscFunctionReturn(0); 223bddcd29dSMark Adams } 224bddcd29dSMark Adams 2254416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2269ae82921SPaul Mullowney { 227e057df02SPaul 
Mullowney MatCUSPARSEStorageFormat format; 2289ae82921SPaul Mullowney PetscBool flg; 229a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2306e111a19SKarl Rupp 2319ae82921SPaul Mullowney PetscFunctionBegin; 232d0609cedSBarry Smith PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options"); 2339ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 234d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 235d0609cedSBarry Smith "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 2369566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format)); 237afb2bd1cSJunchao Zhang 238d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 239d0609cedSBarry Smith "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg)); 2409566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 2419566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg)); 2429566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve)); 243afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 244d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 245d0609cedSBarry Smith "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg)); 246afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with 
cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 247ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301 248aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 249a435da06SStefano Zampini #else 250aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 251a435da06SStefano Zampini #endif 252d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 253d0609cedSBarry Smith "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg)); 254aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 255afb2bd1cSJunchao Zhang 256d0609cedSBarry Smith PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 257d0609cedSBarry Smith "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg)); 258aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 259afb2bd1cSJunchao Zhang #endif 2604c87dfd4SPaul Mullowney } 261d0609cedSBarry Smith PetscOptionsHeadEnd(); 2629ae82921SPaul Mullowney PetscFunctionReturn(0); 2639ae82921SPaul Mullowney } 2649ae82921SPaul Mullowney 265087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 2669ae82921SPaul 
Mullowney { 2679ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2689ae82921SPaul Mullowney PetscInt n = A->rmap->n; 2699ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 270aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 2719ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 2729ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 2739ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 2749ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 2759ae82921SPaul Mullowney 2769ae82921SPaul Mullowney PetscFunctionBegin; 277cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 278c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2799ae82921SPaul Mullowney try { 2809ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 2819ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 282da79fbbcSStefano Zampini if (!loTriFactor) { 2832cbc15d9SMark PetscScalar *AALo; 2842cbc15d9SMark 2859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar))); 2869ae82921SPaul Mullowney 2879ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 2889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt))); 2899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt))); 2909ae82921SPaul Mullowney 2919ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 2929ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 2939ae82921SPaul Mullowney AiLo[n] = nzLower; 2949ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 2959ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 2969ae82921SPaul Mullowney v = aa; 2979ae82921SPaul Mullowney vi = aj; 2989ae82921SPaul Mullowney offset = 1; 2999ae82921SPaul Mullowney rowOffset= 1; 3009ae82921SPaul Mullowney for (i=1; i<n; i++) { 3019ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 302e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3039ae82921SPaul Mullowney AiLo[i] = rowOffset; 3049ae82921SPaul Mullowney rowOffset += nz+1; 3059ae82921SPaul Mullowney 3069566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz)); 3079566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AALo[offset]), v, nz)); 3089ae82921SPaul Mullowney 3099ae82921SPaul Mullowney offset += nz; 3109ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 3119ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 3129ae82921SPaul Mullowney offset += 1; 3139ae82921SPaul Mullowney 3149ae82921SPaul Mullowney v += nz; 3159ae82921SPaul Mullowney vi += nz; 3169ae82921SPaul Mullowney } 3172205254eSKarl Rupp 318aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 3199566063dSJacob Faibussowitsch 
PetscCall(PetscNew(&loTriFactor)); 320da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 321aa372e3fSPaul Mullowney /* Create the matrix description */ 3229566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 3239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 3241b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 3259566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 326afb2bd1cSJunchao Zhang #else 3279566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 328afb2bd1cSJunchao Zhang #endif 3299566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 3309566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 331aa372e3fSPaul Mullowney 332aa372e3fSPaul Mullowney /* set the operation */ 333aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 334aa372e3fSPaul Mullowney 335aa372e3fSPaul Mullowney /* set the matrix */ 336aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 337aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 338aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 339aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 340aa372e3fSPaul Mullowney 341aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 342aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 343aa372e3fSPaul Mullowney 344aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 345aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 346aa372e3fSPaul Mullowney 347aa372e3fSPaul Mullowney 
loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 348aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 349aa372e3fSPaul Mullowney 350afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 3519566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 352261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 3531b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 354261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 355afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 356afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 357afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 3585f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 3599566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 360afb2bd1cSJunchao Zhang #endif 361afb2bd1cSJunchao Zhang 362aa372e3fSPaul Mullowney /* perform the solve analysis */ 363261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 364aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 365aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 366d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 3671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 368d49cd2b7SBarry Smith loTriFactor->solveInfo, 3695f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 370d49cd2b7SBarry Smith #else 3715f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 372afb2bd1cSJunchao Zhang #endif 
3739566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 3749566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 375aa372e3fSPaul Mullowney 376da79fbbcSStefano Zampini /* assign the pointer */ 377aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 3782cbc15d9SMark loTriFactor->AA_h = AALo; 3799566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiLo)); 3809566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjLo)); 3819566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar))); 382da79fbbcSStefano Zampini } else { /* update values only */ 3832cbc15d9SMark if (!loTriFactor->AA_h) { 3849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar))); 3852cbc15d9SMark } 386da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 3872cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 388da79fbbcSStefano Zampini v = aa; 389da79fbbcSStefano Zampini vi = aj; 390da79fbbcSStefano Zampini offset = 1; 391da79fbbcSStefano Zampini for (i=1; i<n; i++) { 392da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 3939566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 394da79fbbcSStefano Zampini offset += nz; 3952cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 396da79fbbcSStefano Zampini offset += 1; 397da79fbbcSStefano Zampini v += nz; 398da79fbbcSStefano Zampini } 3992cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 4009566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar))); 401da79fbbcSStefano Zampini } 4029ae82921SPaul Mullowney } catch(char *ex) { 40398921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4049ae82921SPaul Mullowney } 4059ae82921SPaul Mullowney } 4069ae82921SPaul Mullowney PetscFunctionReturn(0); 4079ae82921SPaul 
Mullowney } 4089ae82921SPaul Mullowney 409087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 4109ae82921SPaul Mullowney { 4119ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4129ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4139ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 414aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 4159ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 4169ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 4179ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 4189ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 4199ae82921SPaul Mullowney 4209ae82921SPaul Mullowney PetscFunctionBegin; 421cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 422c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 4239ae82921SPaul Mullowney try { 4249ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 4259ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 426da79fbbcSStefano Zampini if (!upTriFactor) { 4272cbc15d9SMark PetscScalar *AAUp; 4282cbc15d9SMark 4299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 4302cbc15d9SMark 4319ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 4329566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 4339566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 4349ae82921SPaul Mullowney 4359ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 4369ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 4379ae82921SPaul Mullowney AiUp[n]=nzUpper; 4389ae82921SPaul Mullowney offset = nzUpper; 4399ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 4409ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 4419ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 4429ae82921SPaul Mullowney 443e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 4449ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 4459ae82921SPaul Mullowney 446e057df02SPaul Mullowney /* decrement the offset */ 4479ae82921SPaul Mullowney offset -= (nz+1); 4489ae82921SPaul Mullowney 449e057df02SPaul Mullowney /* first, set the diagonal elements */ 4509ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 45109f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 4529ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 4539ae82921SPaul Mullowney 4549566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz)); 4559566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz)); 4569ae82921SPaul Mullowney } 4572205254eSKarl Rupp 458aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 4599566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 460da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 4612205254eSKarl Rupp 462aa372e3fSPaul Mullowney /* Create the matrix description */ 4639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 4649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 4651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 4669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 467afb2bd1cSJunchao Zhang #else 4689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 469afb2bd1cSJunchao Zhang #endif 4709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 4719566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 472aa372e3fSPaul Mullowney 473aa372e3fSPaul Mullowney /* set the operation */ 474aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 475aa372e3fSPaul Mullowney 476aa372e3fSPaul Mullowney /* set the matrix */ 477aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 478aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 479aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 480aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 481aa372e3fSPaul Mullowney 482aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 483aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 484aa372e3fSPaul Mullowney 485aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 486aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 487aa372e3fSPaul Mullowney 488aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
489aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 490aa372e3fSPaul Mullowney 491afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 4929566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 493261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 4941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 495261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 496afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 497afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 498afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 4995f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 5009566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 501afb2bd1cSJunchao Zhang #endif 502afb2bd1cSJunchao Zhang 503aa372e3fSPaul Mullowney /* perform the solve analysis */ 504261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 505aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 506aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 507d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 5081b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 509d49cd2b7SBarry Smith upTriFactor->solveInfo, 5105f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 511d49cd2b7SBarry Smith #else 5125f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 513afb2bd1cSJunchao Zhang #endif 5149566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 
5159566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 516aa372e3fSPaul Mullowney 517da79fbbcSStefano Zampini /* assign the pointer */ 518aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 5192cbc15d9SMark upTriFactor->AA_h = AAUp; 5209566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 5219566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 5229566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar))); 523da79fbbcSStefano Zampini } else { 5242cbc15d9SMark if (!upTriFactor->AA_h) { 5259566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar))); 5262cbc15d9SMark } 527da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 528da79fbbcSStefano Zampini offset = nzUpper; 529da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 530da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 531da79fbbcSStefano Zampini 532da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 533da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 534da79fbbcSStefano Zampini 535da79fbbcSStefano Zampini /* decrement the offset */ 536da79fbbcSStefano Zampini offset -= (nz+1); 537da79fbbcSStefano Zampini 538da79fbbcSStefano Zampini /* first, set the diagonal elements */ 5392cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 5409566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz)); 541da79fbbcSStefano Zampini } 5422cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 5439566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar))); 544da79fbbcSStefano Zampini } 5459ae82921SPaul Mullowney } catch(char *ex) { 54698921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 5479ae82921SPaul Mullowney } 5489ae82921SPaul 
Mullowney } 5499ae82921SPaul Mullowney PetscFunctionReturn(0); 5509ae82921SPaul Mullowney } 5519ae82921SPaul Mullowney 552087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 5539ae82921SPaul Mullowney { 5549ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 5559ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 5569ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 5579ae82921SPaul Mullowney PetscBool row_identity,col_identity; 5589ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5599ae82921SPaul Mullowney 5609ae82921SPaul Mullowney PetscFunctionBegin; 56128b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 5629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 5639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 5642205254eSKarl Rupp 565da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 566aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 5679ae82921SPaul Mullowney 568c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 569e057df02SPaul Mullowney /* lower triangular indices */ 5709566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 571da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 572da79fbbcSStefano Zampini const PetscInt *r; 573da79fbbcSStefano Zampini 5749566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&r)); 575aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 576aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 5779566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&r)); 5789566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 579da79fbbcSStefano Zampini } 5809ae82921SPaul 
Mullowney 581e057df02SPaul Mullowney /* upper triangular indices */ 5829566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol,&col_identity)); 583da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 584da79fbbcSStefano Zampini const PetscInt *c; 585da79fbbcSStefano Zampini 5869566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol,&c)); 587aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 588aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 5899566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol,&c)); 5909566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt))); 591da79fbbcSStefano Zampini } 5929ae82921SPaul Mullowney PetscFunctionReturn(0); 5939ae82921SPaul Mullowney } 5949ae82921SPaul Mullowney 595087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 596087f3262SPaul Mullowney { 597087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 598087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 599aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 600aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 601087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 602087f3262SPaul Mullowney PetscScalar *AAUp; 603087f3262SPaul Mullowney PetscScalar *AALo; 604087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 605087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 606087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 607087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 608087f3262SPaul Mullowney 609087f3262SPaul Mullowney PetscFunctionBegin; 610cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 611c70f7ee4SJunchao 
Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 612087f3262SPaul Mullowney try { 6139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar))); 6149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar))); 615da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 616087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 6179566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt))); 6189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt))); 619087f3262SPaul Mullowney 620087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 621087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 622087f3262SPaul Mullowney AiUp[n]=nzUpper; 623087f3262SPaul Mullowney offset = 0; 624087f3262SPaul Mullowney for (i=0; i<n; i++) { 625087f3262SPaul Mullowney /* set the pointers */ 626087f3262SPaul Mullowney v = aa + ai[i]; 627087f3262SPaul Mullowney vj = aj + ai[i]; 628087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 629087f3262SPaul Mullowney 630087f3262SPaul Mullowney /* first, set the diagonal elements */ 631087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 63209f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 633087f3262SPaul Mullowney AiUp[i] = offset; 63409f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 635087f3262SPaul Mullowney 636087f3262SPaul Mullowney offset+=1; 637087f3262SPaul Mullowney if (nz>0) { 6389566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 6399566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 640087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 641087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 642087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 643087f3262SPaul Mullowney } 
644087f3262SPaul Mullowney offset+=nz; 645087f3262SPaul Mullowney } 646087f3262SPaul Mullowney } 647087f3262SPaul Mullowney 648aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 6499566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 650da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 651087f3262SPaul Mullowney 652aa372e3fSPaul Mullowney /* Create the matrix description */ 6539566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 6549566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 6551b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 6569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 657afb2bd1cSJunchao Zhang #else 6589566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 659afb2bd1cSJunchao Zhang #endif 6609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 6619566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 662087f3262SPaul Mullowney 663aa372e3fSPaul Mullowney /* set the matrix */ 664aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 665aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 666aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 667aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 668aa372e3fSPaul Mullowney 669aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 670aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 671aa372e3fSPaul Mullowney 672aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 
673aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 674aa372e3fSPaul Mullowney 675aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 676aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 677aa372e3fSPaul Mullowney 678afb2bd1cSJunchao Zhang /* set the operation */ 679afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 680afb2bd1cSJunchao Zhang 681afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 6829566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 683261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 6841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 685261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 686afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 687afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 688afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 6895f80ce2aSJacob Faibussowitsch &upTriFactor->solveBufferSize)); 6909566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize)); 691afb2bd1cSJunchao Zhang #endif 692afb2bd1cSJunchao Zhang 693aa372e3fSPaul Mullowney /* perform the solve analysis */ 694261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 695aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 696aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 697d49cd2b7SBarry Smith upTriFactor->csrMat->column_indices->data().get(), 6981b0a6780SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(9,0,0) 699d49cd2b7SBarry Smith upTriFactor->solveInfo, 7005f80ce2aSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 701d49cd2b7SBarry Smith #else 7025f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 703afb2bd1cSJunchao Zhang #endif 7049566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 7059566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 706aa372e3fSPaul Mullowney 707da79fbbcSStefano Zampini /* assign the pointer */ 708aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 709aa372e3fSPaul Mullowney 710aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 7119566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 712da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 713aa372e3fSPaul Mullowney 714aa372e3fSPaul Mullowney /* Create the matrix description */ 7159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 7169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 7171b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 7189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 719afb2bd1cSJunchao Zhang #else 7209566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 721afb2bd1cSJunchao Zhang #endif 7229566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 7239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 724aa372e3fSPaul Mullowney 725aa372e3fSPaul Mullowney /* set the operation */ 726aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 
727aa372e3fSPaul Mullowney 728aa372e3fSPaul Mullowney /* set the matrix */ 729aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 730aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 731aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 732aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 733aa372e3fSPaul Mullowney 734aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 735aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 736aa372e3fSPaul Mullowney 737aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 738aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 739aa372e3fSPaul Mullowney 740aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 741aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 742aa372e3fSPaul Mullowney 743afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 7449566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 745261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 7461b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 747261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 748afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 749afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 750afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 7515f80ce2aSJacob Faibussowitsch &loTriFactor->solveBufferSize)); 7529566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize)); 753afb2bd1cSJunchao Zhang #endif 
754afb2bd1cSJunchao Zhang 755aa372e3fSPaul Mullowney /* perform the solve analysis */ 756261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 757aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 758aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 759d49cd2b7SBarry Smith loTriFactor->csrMat->column_indices->data().get(), 7601b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 761d49cd2b7SBarry Smith loTriFactor->solveInfo, 7625f80ce2aSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 763d49cd2b7SBarry Smith #else 7645f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 765afb2bd1cSJunchao Zhang #endif 7669566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 7679566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 768aa372e3fSPaul Mullowney 769da79fbbcSStefano Zampini /* assign the pointer */ 770aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 771087f3262SPaul Mullowney 7729566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)))); 7739566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 7749566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 775da79fbbcSStefano Zampini } else { 776da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 777da79fbbcSStefano Zampini offset = 0; 778da79fbbcSStefano Zampini for (i=0; i<n; i++) { 779da79fbbcSStefano Zampini /* set the pointers */ 780da79fbbcSStefano Zampini v = aa + ai[i]; 781da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 782da79fbbcSStefano Zampini 783da79fbbcSStefano Zampini /* first, set the diagonal elements */ 784da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 785da79fbbcSStefano 
Zampini AALo[offset] = 1.0/v[nz]; 786da79fbbcSStefano Zampini 787da79fbbcSStefano Zampini offset+=1; 788da79fbbcSStefano Zampini if (nz>0) { 7899566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 790da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 791da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 792da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 793da79fbbcSStefano Zampini } 794da79fbbcSStefano Zampini offset+=nz; 795da79fbbcSStefano Zampini } 796da79fbbcSStefano Zampini } 79728b400f6SJacob Faibussowitsch PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 79828b400f6SJacob Faibussowitsch PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 799da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 800da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 8019566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar))); 802da79fbbcSStefano Zampini } 8039566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AAUp)); 8049566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AALo)); 805087f3262SPaul Mullowney } catch(char *ex) { 80698921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 807087f3262SPaul Mullowney } 808087f3262SPaul Mullowney } 809087f3262SPaul Mullowney PetscFunctionReturn(0); 810087f3262SPaul Mullowney } 811087f3262SPaul Mullowney 812087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 8139ae82921SPaul Mullowney { 814087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 815087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 816087f3262SPaul Mullowney IS ip = a->row; 817087f3262SPaul Mullowney PetscBool perm_identity; 818087f3262SPaul Mullowney PetscInt n = A->rmap->n; 819087f3262SPaul Mullowney 820087f3262SPaul Mullowney 
PetscFunctionBegin; 82128b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 8229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 823da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 824aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 825aa372e3fSPaul Mullowney 826da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 827da79fbbcSStefano Zampini 828087f3262SPaul Mullowney /* lower triangular indices */ 8299566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip,&perm_identity)); 830087f3262SPaul Mullowney if (!perm_identity) { 8314e4bbfaaSStefano Zampini IS iip; 832da79fbbcSStefano Zampini const PetscInt *irip,*rip; 8334e4bbfaaSStefano Zampini 8349566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip)); 8359566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iip,&irip)); 8369566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ip,&rip)); 837aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 838aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 839aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 8404e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 8419566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iip,&irip)); 8429566063dSJacob Faibussowitsch PetscCall(ISDestroy(&iip)); 8439566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ip,&rip)); 8449566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 845da79fbbcSStefano Zampini } 846087f3262SPaul Mullowney PetscFunctionReturn(0); 847087f3262SPaul Mullowney } 848087f3262SPaul Mullowney 849087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 850087f3262SPaul Mullowney { 851087f3262SPaul 
Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 852087f3262SPaul Mullowney IS ip = b->row; 853087f3262SPaul Mullowney PetscBool perm_identity; 854087f3262SPaul Mullowney 855087f3262SPaul Mullowney PetscFunctionBegin; 8569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 8579566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info)); 858ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 859087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 8609566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip,&perm_identity)); 861087f3262SPaul Mullowney if (perm_identity) { 862087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 863087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 8644e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8654e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 866087f3262SPaul Mullowney } else { 867087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 868087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 8694e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8704e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 871087f3262SPaul Mullowney } 872087f3262SPaul Mullowney 873087f3262SPaul Mullowney /* get the triangular factors */ 8749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 875087f3262SPaul Mullowney PetscFunctionReturn(0); 876087f3262SPaul Mullowney } 8779ae82921SPaul Mullowney 878b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 879bda325fcSPaul Mullowney { 880bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 881aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 882aa372e3fSPaul Mullowney 
Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 883da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 884da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 885aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 886aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 887aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 888aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 889b175d8bbSPaul Mullowney 890bda325fcSPaul Mullowney PetscFunctionBegin; 891aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 8929566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 893da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 894aa372e3fSPaul Mullowney 895aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 896aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 897aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 898aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
899aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 900aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 901aa372e3fSPaul Mullowney 902aa372e3fSPaul Mullowney /* Create the matrix description */ 9039566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 9049566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 9059566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 9069566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 9079566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 908aa372e3fSPaul Mullowney 909aa372e3fSPaul Mullowney /* set the operation */ 910aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 911aa372e3fSPaul Mullowney 912aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 913aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 914afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 915afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 916aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 917afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 918afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 919afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 920aa372e3fSPaul Mullowney 921aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 922afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 9239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 924afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 925afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 926afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 927afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 928afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 929afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 930afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 9315f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 9329566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize)); 933afb2bd1cSJunchao Zhang #endif 934afb2bd1cSJunchao Zhang 9359566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 9369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 937aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 938aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 939aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 940aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 941aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 942afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 943afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 944afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 9455f80ce2aSJacob Faibussowitsch 
CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 946afb2bd1cSJunchao Zhang #else 947afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 9485f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 949afb2bd1cSJunchao Zhang #endif 9509566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9519566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 952aa372e3fSPaul Mullowney 953afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 9549566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 955261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 9561b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 957261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 958afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 959afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 960afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 9615f80ce2aSJacob Faibussowitsch &loTriFactorT->solveBufferSize)); 9629566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize)); 963afb2bd1cSJunchao Zhang #endif 964afb2bd1cSJunchao Zhang 965afb2bd1cSJunchao Zhang /* perform the solve analysis */ 966261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 967afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 968afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 969d49cd2b7SBarry Smith 
loTriFactorT->csrMat->column_indices->data().get(), 9701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 971d49cd2b7SBarry Smith loTriFactorT->solveInfo, 9725f80ce2aSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 973d49cd2b7SBarry Smith #else 9745f80ce2aSJacob Faibussowitsch loTriFactorT->solveInfo)); 975afb2bd1cSJunchao Zhang #endif 9769566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9779566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 978aa372e3fSPaul Mullowney 979da79fbbcSStefano Zampini /* assign the pointer */ 980aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 981aa372e3fSPaul Mullowney 982aa372e3fSPaul Mullowney /*********************************************/ 983aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 984aa372e3fSPaul Mullowney /*********************************************/ 985aa372e3fSPaul Mullowney 986aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 9879566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactorT)); 988da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 989aa372e3fSPaul Mullowney 990aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 991aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 992aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 993aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
994aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 995aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 996aa372e3fSPaul Mullowney 997aa372e3fSPaul Mullowney /* Create the matrix description */ 9989566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 9999566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 10009566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 10019566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 10029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1003aa372e3fSPaul Mullowney 1004aa372e3fSPaul Mullowney /* set the operation */ 1005aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1006aa372e3fSPaul Mullowney 1007aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1008aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1009afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1010afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1011aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1012afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1013afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1014afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1015aa372e3fSPaul Mullowney 1016aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1017afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 10189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1019afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1020afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1021afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1022afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1023afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1024afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1025afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 10265f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 10279566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize)); 1028afb2bd1cSJunchao Zhang #endif 1029afb2bd1cSJunchao Zhang 10309566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 10319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1032aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1033aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1034aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1035aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1036aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1037afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1038afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1039afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 
10405f80ce2aSJacob Faibussowitsch CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 1041afb2bd1cSJunchao Zhang #else 1042afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 10435f80ce2aSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase)); 1044afb2bd1cSJunchao Zhang #endif 1045d49cd2b7SBarry Smith 10469566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10479566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1048aa372e3fSPaul Mullowney 1049afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 10509566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1051261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 10521b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1053261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1054afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1055afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1056afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 10575f80ce2aSJacob Faibussowitsch &upTriFactorT->solveBufferSize)); 10589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize)); 1059afb2bd1cSJunchao Zhang #endif 1060afb2bd1cSJunchao Zhang 1061afb2bd1cSJunchao Zhang /* perform the solve analysis */ 10625f80ce2aSJacob Faibussowitsch /* christ, would it have killed you to put this stuff in a function????????? 
*/ 1063261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1064afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1065afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1066d49cd2b7SBarry Smith upTriFactorT->csrMat->column_indices->data().get(), 10671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1068d49cd2b7SBarry Smith upTriFactorT->solveInfo, 10695f80ce2aSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1070d49cd2b7SBarry Smith #else 10715f80ce2aSJacob Faibussowitsch upTriFactorT->solveInfo)); 1072afb2bd1cSJunchao Zhang #endif 1073d49cd2b7SBarry Smith 10749566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10759566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0)); 1076aa372e3fSPaul Mullowney 1077da79fbbcSStefano Zampini /* assign the pointer */ 1078aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1079bda325fcSPaul Mullowney PetscFunctionReturn(0); 1080bda325fcSPaul Mullowney } 1081bda325fcSPaul Mullowney 1082a49f1ed0SStefano Zampini struct PetscScalarToPetscInt 1083a49f1ed0SStefano Zampini { 1084a49f1ed0SStefano Zampini __host__ __device__ 1085a49f1ed0SStefano Zampini PetscInt operator()(PetscScalar s) 1086a49f1ed0SStefano Zampini { 1087a49f1ed0SStefano Zampini return (PetscInt)PetscRealPart(s); 1088a49f1ed0SStefano Zampini } 1089a49f1ed0SStefano Zampini }; 1090a49f1ed0SStefano Zampini 10913606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1092bda325fcSPaul Mullowney { 1093aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1094a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1095bda325fcSPaul Mullowney Mat_SeqAIJ *a = 
(Mat_SeqAIJ*)A->data; 1096bda325fcSPaul Mullowney cusparseStatus_t stat; 1097aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1098b175d8bbSPaul Mullowney 1099bda325fcSPaul Mullowney PetscFunctionBegin; 11009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1101a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 110228b400f6SJacob Faibussowitsch PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct"); 1103a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 110408401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct"); 11051a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 11069566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 11079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1108a49f1ed0SStefano Zampini if (cusparsestruct->format != MAT_CUSPARSE_CSR) { 11099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 1110a49f1ed0SStefano Zampini } 1111a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1112aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 11139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1114aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 11159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 11169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1117aa372e3fSPaul Mullowney 1118b06137fdSPaul Mullowney /* set alpha and beta */ 11199566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar))); 11209566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar))); 11219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 11229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 11239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 11249566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 1125b06137fdSPaul Mullowney 1126aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1127aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1128a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1129554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1130554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1131aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1132a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1133aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1134aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1135a3fdcf43SKarl Rupp 1136039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); } 113781902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1138afb2bd1cSJunchao Zhang 1139afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 11403606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,2,1) 1141afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1142afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1143afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1144afb2bd1cSJunchao Zhang 
matrixT->values->data().get(), 1145afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 11469566063dSJacob Faibussowitsch indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 11473606e59fSJunchao Zhang #else 11483606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 11493606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 11503606e59fSJunchao Zhang 11513606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 11523606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 11533606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 11543606e59fSJunchao Zhang */ 11553606e59fSJunchao Zhang if (matrixT->num_entries) { 11563606e59fSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 11573606e59fSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 11583606e59fSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 11593606e59fSJunchao Zhang matrixT->values->data().get(), 11603606e59fSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, 11619566063dSJacob Faibussowitsch indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat); 11623606e59fSJunchao Zhang 11633606e59fSJunchao Zhang } else { 11643606e59fSJunchao Zhang matstructT->matDescr = NULL; 11653606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 11663606e59fSJunchao Zhang } 11673606e59fSJunchao Zhang #endif 1168afb2bd1cSJunchao Zhang #endif 1169aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1170afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 
1171afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1172afb2bd1cSJunchao Zhang #else 1173aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 117451c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 117551c6d536SStefano Zampini /* First convert HYB to CSR */ 1176aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1177aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1178aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1179aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1180aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1181aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1182aa372e3fSPaul Mullowney 1183aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1184aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1185aa372e3fSPaul Mullowney temp->values->data().get(), 1186aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 11879566063dSJacob Faibussowitsch temp->column_indices->data().get());PetscCallCUSPARSE(stat); 1188aa372e3fSPaul Mullowney 1189aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. 
the matrix transpose) */ 1190aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1191aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1192aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1193aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1194aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1195aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1196aa372e3fSPaul Mullowney 1197aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1198aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1199aa372e3fSPaul Mullowney temp->values->data().get(), 1200aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1201aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1202aa372e3fSPaul Mullowney tempT->values->data().get(), 1203aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1204aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 12059566063dSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1206aa372e3fSPaul Mullowney 1207aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1208aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 12099566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 1210aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1211aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1212aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1213aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1214aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1215aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 12169566063dSJacob Faibussowitsch hybMat, 0, partition);PetscCallCUSPARSE(stat); 1217aa372e3fSPaul Mullowney 1218aa372e3fSPaul Mullowney /* assign the pointer */ 1219aa372e3fSPaul Mullowney matstructT->mat = hybMat; 12201a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1221aa372e3fSPaul Mullowney /* delete temporaries */ 1222aa372e3fSPaul Mullowney if (tempT) { 1223aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1224aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1225aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1226aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1227087f3262SPaul Mullowney } 1228aa372e3fSPaul Mullowney if (temp) { 1229aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1230aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1231aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1232aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1233aa372e3fSPaul Mullowney } 1234afb2bd1cSJunchao Zhang #endif 1235aa372e3fSPaul Mullowney } 1236a49f1ed0SStefano Zampini } 1237a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1238a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1239a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat; 124028b400f6SJacob Faibussowitsch 
PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix"); 124128b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows"); 124228b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols"); 124328b400f6SJacob Faibussowitsch PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values"); 124428b400f6SJacob Faibussowitsch PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT"); 124528b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows"); 124628b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols"); 124728b400f6SJacob Faibussowitsch PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values"); 1248a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1249a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1250a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 12519566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 1252a49f1ed0SStefano Zampini } 1253a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1254a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1255a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1256a49f1ed0SStefano Zampini 1257a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1258a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1259a49f1ed0SStefano Zampini void *csr2cscBuffer; 1260a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 1261a49f1ed0SStefano Zampini stat = 
cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1262a49f1ed0SStefano Zampini A->cmap->n, matrix->num_entries, 1263a49f1ed0SStefano Zampini matrix->values->data().get(), 1264a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->data().get(), 1265a49f1ed0SStefano Zampini matrix->column_indices->data().get(), 1266a49f1ed0SStefano Zampini matrixT->values->data().get(), 1267a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1268a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 12699566063dSJacob Faibussowitsch cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat); 12709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize)); 1271a49f1ed0SStefano Zampini #endif 1272a49f1ed0SStefano Zampini 12731a2c6b5cSJunchao Zhang if (matrix->num_entries) { 12741a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 12751a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 12761a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 12771a2c6b5cSJunchao Zhang 12781a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 12791a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
12801a2c6b5cSJunchao Zhang */ 12811a2c6b5cSJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 12821a2c6b5cSJunchao Zhang A->cmap->n,matrix->num_entries, 12831a2c6b5cSJunchao Zhang csr2csc_a.data().get(), 12841a2c6b5cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 12851a2c6b5cSJunchao Zhang matrix->column_indices->data().get(), 1286a49f1ed0SStefano Zampini matrixT->values->data().get(), 1287a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1288a49f1ed0SStefano Zampini matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1289a49f1ed0SStefano Zampini CUSPARSE_ACTION_NUMERIC,indexBase, 12909566063dSJacob Faibussowitsch cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat); 1291a49f1ed0SStefano Zampini #else 1292a49f1ed0SStefano Zampini matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 12939566063dSJacob Faibussowitsch CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat); 1294a49f1ed0SStefano Zampini #endif 12951a2c6b5cSJunchao Zhang } else { 12961a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase); 12971a2c6b5cSJunchao Zhang } 12981a2c6b5cSJunchao Zhang 1299a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1300a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt())); 1301a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 13029566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1303a49f1ed0SStefano Zampini #endif 1304a49f1ed0SStefano Zampini } 1305a49f1ed0SStefano Zampini PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), 1306a49f1ed0SStefano Zampini 
thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), 1307a49f1ed0SStefano Zampini matrixT->values->begin())); 1308a49f1ed0SStefano Zampini } 13099566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 13109566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0)); 1311213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1312213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1313aa372e3fSPaul Mullowney /* assign the pointer */ 1314aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 13151a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1316bda325fcSPaul Mullowney PetscFunctionReturn(0); 1317bda325fcSPaul Mullowney } 1318bda325fcSPaul Mullowney 1319a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 13206fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1321bda325fcSPaul Mullowney { 1322c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1323465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1324465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1325465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1326465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1327bda325fcSPaul Mullowney cusparseStatus_t stat; 1328bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1329aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1330aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1331aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = 
(THRUSTARRAY*)cusparseTriFactors->workVector; 1332bda325fcSPaul Mullowney 1333bda325fcSPaul Mullowney PetscFunctionBegin; 1334aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1335aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 13369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1337aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1338aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1339bda325fcSPaul Mullowney } 1340bda325fcSPaul Mullowney 1341bda325fcSPaul Mullowney /* Get the GPU pointers */ 13429566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 13439566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1344c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1345c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1346bda325fcSPaul Mullowney 13479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1348aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1349a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1350c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1351c41cb2e2SAlejandro Lamas Daviña xGPU); 1352aa372e3fSPaul Mullowney 1353aa372e3fSPaul Mullowney /* First, solve U */ 1354261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1355afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 13561b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1357afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1358afb2bd1cSJunchao Zhang #endif 1359afb2bd1cSJunchao Zhang 
&PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1360aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1361aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1362aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1363aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1364d49cd2b7SBarry Smith xarray, 13651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1366d49cd2b7SBarry Smith tempGPU->data().get(), 13679566063dSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1368d49cd2b7SBarry Smith #else 13699566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1370afb2bd1cSJunchao Zhang #endif 1371aa372e3fSPaul Mullowney 1372aa372e3fSPaul Mullowney /* Then, solve L */ 1373261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1374afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 13751b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1376afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1377afb2bd1cSJunchao Zhang #endif 1378afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1379aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1380aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1381aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1382aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1383d49cd2b7SBarry Smith tempGPU->data().get(), 13841b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1385d49cd2b7SBarry Smith xarray, 13869566063dSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1387d49cd2b7SBarry Smith #else 13889566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1389afb2bd1cSJunchao Zhang #endif 1390aa372e3fSPaul Mullowney 1391aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column 
permutation ... can't be done in place. */ 1392a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1393c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1394aa372e3fSPaul Mullowney tempGPU->begin()); 1395aa372e3fSPaul Mullowney 1396aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1397a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU); 1398bda325fcSPaul Mullowney 1399bda325fcSPaul Mullowney /* restore */ 14009566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 14019566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 14029566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1404bda325fcSPaul Mullowney PetscFunctionReturn(0); 1405bda325fcSPaul Mullowney } 1406bda325fcSPaul Mullowney 14076fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1408bda325fcSPaul Mullowney { 1409465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1410465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1411bda325fcSPaul Mullowney cusparseStatus_t stat; 1412bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1413aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1414aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1415aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1416bda325fcSPaul 
Mullowney 1417bda325fcSPaul Mullowney PetscFunctionBegin; 1418aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1419aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 14209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1421aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1422aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1423bda325fcSPaul Mullowney } 1424bda325fcSPaul Mullowney 1425bda325fcSPaul Mullowney /* Get the GPU pointers */ 14269566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 14279566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1428bda325fcSPaul Mullowney 14299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1430aa372e3fSPaul Mullowney /* First, solve U */ 1431261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1432afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1434afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1435afb2bd1cSJunchao Zhang #endif 1436afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1437aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1438aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1439aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1440aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1441d49cd2b7SBarry Smith barray, 14421b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1443d49cd2b7SBarry Smith tempGPU->data().get(), 14449566063dSJacob Faibussowitsch upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1445d49cd2b7SBarry Smith #else 14469566063dSJacob Faibussowitsch 
tempGPU->data().get());PetscCallCUSPARSE(stat); 1447afb2bd1cSJunchao Zhang #endif 1448aa372e3fSPaul Mullowney 1449aa372e3fSPaul Mullowney /* Then, solve L */ 1450261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1451afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14521b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1453afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1454afb2bd1cSJunchao Zhang #endif 1455afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1456aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1457aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1458aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1459aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1460d49cd2b7SBarry Smith tempGPU->data().get(), 14611b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1462d49cd2b7SBarry Smith xarray, 14639566063dSJacob Faibussowitsch loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat); 1464d49cd2b7SBarry Smith #else 14659566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1466afb2bd1cSJunchao Zhang #endif 1467bda325fcSPaul Mullowney 1468bda325fcSPaul Mullowney /* restore */ 14699566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 14709566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 14719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 1473bda325fcSPaul Mullowney PetscFunctionReturn(0); 1474bda325fcSPaul Mullowney } 1475bda325fcSPaul Mullowney 14766fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 14779ae82921SPaul Mullowney { 1478465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1479465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 
1480465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1481465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 14829ae82921SPaul Mullowney cusparseStatus_t stat; 14839ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1484aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1485aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1486aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 14879ae82921SPaul Mullowney 14889ae82921SPaul Mullowney PetscFunctionBegin; 1489ebc8f436SDominic Meiser 1490e057df02SPaul Mullowney /* Get the GPU pointers */ 14919566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 14929566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb,&barray)); 1493c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1494c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 14959ae82921SPaul Mullowney 14969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1497aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1498a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1499c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 15004e4bbfaaSStefano Zampini tempGPU->begin()); 1501aa372e3fSPaul Mullowney 1502aa372e3fSPaul Mullowney /* Next, solve L */ 1503261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1504afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 15051b0a6780SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1506afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1507afb2bd1cSJunchao Zhang #endif 1508afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1509aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1510aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1511aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1512aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1513d49cd2b7SBarry Smith tempGPU->data().get(), 15141b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1515d49cd2b7SBarry Smith xarray, 15169566063dSJacob Faibussowitsch loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1517d49cd2b7SBarry Smith #else 15189566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1519afb2bd1cSJunchao Zhang #endif 1520aa372e3fSPaul Mullowney 1521aa372e3fSPaul Mullowney /* Then, solve U */ 1522261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1523afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 15241b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1525afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1526afb2bd1cSJunchao Zhang #endif 1527afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1528aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1529aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1530aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1531d49cd2b7SBarry Smith upTriFactor->solveInfo,xarray, 15321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1533d49cd2b7SBarry Smith tempGPU->data().get(), 15349566063dSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1535d49cd2b7SBarry Smith #else 15369566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1537afb2bd1cSJunchao Zhang #endif 
1538d49cd2b7SBarry Smith 15394e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 1540a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 15414e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 15424e4bbfaaSStefano Zampini xGPU); 15439ae82921SPaul Mullowney 15449566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 15459566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 15469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 15489ae82921SPaul Mullowney PetscFunctionReturn(0); 15499ae82921SPaul Mullowney } 15509ae82921SPaul Mullowney 15516fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 15529ae82921SPaul Mullowney { 1553465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1554465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 15559ae82921SPaul Mullowney cusparseStatus_t stat; 15569ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1557aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1558aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1559aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 15609ae82921SPaul Mullowney 15619ae82921SPaul Mullowney PetscFunctionBegin; 1562e057df02SPaul Mullowney /* Get the GPU pointers */ 15639566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx,&xarray)); 15649566063dSJacob Faibussowitsch 
PetscCall(VecCUDAGetArrayRead(bb,&barray)); 15659ae82921SPaul Mullowney 15669566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1567aa372e3fSPaul Mullowney /* First, solve L */ 1568261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1569afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 15701b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1571afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1572afb2bd1cSJunchao Zhang #endif 1573afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1574aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1575aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1576aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1577aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1578d49cd2b7SBarry Smith barray, 15791b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1580d49cd2b7SBarry Smith tempGPU->data().get(), 15819566063dSJacob Faibussowitsch loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1582d49cd2b7SBarry Smith #else 15839566063dSJacob Faibussowitsch tempGPU->data().get());PetscCallCUSPARSE(stat); 1584afb2bd1cSJunchao Zhang #endif 1585d49cd2b7SBarry Smith 1586aa372e3fSPaul Mullowney /* Next, solve U */ 1587261a78b4SJunchao Zhang stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1588afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 15891b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1590afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1591afb2bd1cSJunchao Zhang #endif 1592afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1593aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1594aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1595aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1596aa372e3fSPaul Mullowney 
upTriFactor->solveInfo, 1597d49cd2b7SBarry Smith tempGPU->data().get(), 15981b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1599d49cd2b7SBarry Smith xarray, 16009566063dSJacob Faibussowitsch upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat); 1601d49cd2b7SBarry Smith #else 16029566063dSJacob Faibussowitsch xarray);PetscCallCUSPARSE(stat); 1603afb2bd1cSJunchao Zhang #endif 16049ae82921SPaul Mullowney 16059566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb,&barray)); 16069566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx,&xarray)); 16079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16089566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n)); 16099ae82921SPaul Mullowney PetscFunctionReturn(0); 16109ae82921SPaul Mullowney } 16119ae82921SPaul Mullowney 1612*da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 1613*da112707SJunchao Zhang /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */ 1614*da112707SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x) 1615*da112707SJunchao Zhang { 1616*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1617*da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1618*da112707SJunchao Zhang const PetscScalar *barray; 1619*da112707SJunchao Zhang PetscScalar *xarray; 1620*da112707SJunchao Zhang 1621*da112707SJunchao Zhang PetscFunctionBegin; 1622*da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 1623*da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b,&barray)); 1624*da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1625*da112707SJunchao Zhang 1626*da112707SJunchao Zhang /* Solve L*y = b */ 1627*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 1628*da112707SJunchao Zhang 
PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 1629*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1630*da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1631*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1632*da112707SJunchao Zhang fs->spMatDescr_L, /* L Y = X */ 1633*da112707SJunchao Zhang fs->dnVecDescr_X, 1634*da112707SJunchao Zhang fs->dnVecDescr_Y, 1635*da112707SJunchao Zhang cusparse_scalartype, 1636*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1637*da112707SJunchao Zhang fs->spsvDescr_L)); 1638*da112707SJunchao Zhang 1639*da112707SJunchao Zhang /* Solve U*x = y */ 1640*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 1641*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1642*da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1643*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1644*da112707SJunchao Zhang fs->spMatDescr_U, /* U X = Y */ 1645*da112707SJunchao Zhang fs->dnVecDescr_Y, 1646*da112707SJunchao Zhang fs->dnVecDescr_X, 1647*da112707SJunchao Zhang cusparse_scalartype, 1648*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1649*da112707SJunchao Zhang fs->spsvDescr_U)); 1650*da112707SJunchao Zhang 1651*da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b,&barray)); 1652*da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 1653*da112707SJunchao Zhang 1654*da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1655*da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 1656*da112707SJunchao Zhang PetscFunctionReturn(0); 1657*da112707SJunchao Zhang } 1658*da112707SJunchao Zhang 1659*da112707SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x) 1660*da112707SJunchao Zhang { 1661*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1662*da112707SJunchao Zhang Mat_SeqAIJ *aij = 
(Mat_SeqAIJ*)fact->data; 1663*da112707SJunchao Zhang const PetscScalar *barray; 1664*da112707SJunchao Zhang PetscScalar *xarray; 1665*da112707SJunchao Zhang 1666*da112707SJunchao Zhang PetscFunctionBegin; 1667*da112707SJunchao Zhang if (!fs->builtSolveTranspose) { /* Call MatSolveTranspose() for the first time */ 1668*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1669*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1670*da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1671*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1672*da112707SJunchao Zhang fs->spMatDescr_L, /* The matrix is still L. We only do tranpose solve with it */ 1673*da112707SJunchao Zhang fs->dnVecDescr_X, 1674*da112707SJunchao Zhang fs->dnVecDescr_Y, 1675*da112707SJunchao Zhang cusparse_scalartype, 1676*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1677*da112707SJunchao Zhang fs->spsvDescr_Lt, 1678*da112707SJunchao Zhang &fs->spsvBufferSize_Lt)); 1679*da112707SJunchao Zhang 1680*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1681*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1682*da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1683*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1684*da112707SJunchao Zhang fs->spMatDescr_U, 1685*da112707SJunchao Zhang fs->dnVecDescr_X, 1686*da112707SJunchao Zhang fs->dnVecDescr_Y, 1687*da112707SJunchao Zhang cusparse_scalartype, 1688*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1689*da112707SJunchao Zhang fs->spsvDescr_Ut, 1690*da112707SJunchao Zhang &fs->spsvBufferSize_Ut)); 1691*da112707SJunchao Zhang 1692*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Ut,fs->spsvBufferSize_Ut)); 1693*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt)); 1694*da112707SJunchao Zhang 1695*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 
1696*da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1697*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1698*da112707SJunchao Zhang fs->spMatDescr_L, 1699*da112707SJunchao Zhang fs->dnVecDescr_X, 1700*da112707SJunchao Zhang fs->dnVecDescr_Y, 1701*da112707SJunchao Zhang cusparse_scalartype, 1702*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1703*da112707SJunchao Zhang fs->spsvDescr_Lt, 1704*da112707SJunchao Zhang fs->spsvBuffer_Lt)); 1705*da112707SJunchao Zhang 1706*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1707*da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1708*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1709*da112707SJunchao Zhang fs->spMatDescr_U, 1710*da112707SJunchao Zhang fs->dnVecDescr_X, 1711*da112707SJunchao Zhang fs->dnVecDescr_Y, 1712*da112707SJunchao Zhang cusparse_scalartype, 1713*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1714*da112707SJunchao Zhang fs->spsvDescr_Ut, 1715*da112707SJunchao Zhang fs->spsvBuffer_Ut)); 1716*da112707SJunchao Zhang fs->builtSolveTranspose = PETSC_TRUE; 1717*da112707SJunchao Zhang } 1718*da112707SJunchao Zhang 1719*da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 1720*da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b,&barray)); 1721*da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1722*da112707SJunchao Zhang 1723*da112707SJunchao Zhang /* Solve Ut*y = b */ 1724*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 1725*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 1726*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1727*da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1728*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1729*da112707SJunchao Zhang fs->spMatDescr_U, /* Ut Y = X */ 1730*da112707SJunchao Zhang fs->dnVecDescr_X, 1731*da112707SJunchao Zhang fs->dnVecDescr_Y, 1732*da112707SJunchao Zhang cusparse_scalartype, 
1733*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1734*da112707SJunchao Zhang fs->spsvDescr_Ut)); 1735*da112707SJunchao Zhang 1736*da112707SJunchao Zhang /* Solve Lt*x = y */ 1737*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 1738*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 1739*da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 1740*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1741*da112707SJunchao Zhang fs->spMatDescr_L, /* Lt X = Y */ 1742*da112707SJunchao Zhang fs->dnVecDescr_Y, 1743*da112707SJunchao Zhang fs->dnVecDescr_X, 1744*da112707SJunchao Zhang cusparse_scalartype, 1745*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1746*da112707SJunchao Zhang fs->spsvDescr_Lt)); 1747*da112707SJunchao Zhang 1748*da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b,&barray)); 1749*da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 1750*da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1751*da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 1752*da112707SJunchao Zhang PetscFunctionReturn(0); 1753*da112707SJunchao Zhang } 1754*da112707SJunchao Zhang 1755*da112707SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info) 1756*da112707SJunchao Zhang { 1757*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1758*da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 1759*da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1760*da112707SJunchao Zhang CsrMatrix *Acsr; 1761*da112707SJunchao Zhang PetscInt m,nz; 1762*da112707SJunchao Zhang PetscBool flg; 1763*da112707SJunchao Zhang 1764*da112707SJunchao Zhang PetscFunctionBegin; 1765*da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1766*da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 
1767*da112707SJunchao Zhang PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 1768*da112707SJunchao Zhang } 1769*da112707SJunchao Zhang 1770*da112707SJunchao Zhang /* Copy A's value to fact */ 1771*da112707SJunchao Zhang m = fact->rmap->n; 1772*da112707SJunchao Zhang nz = aij->nz; 1773*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1774*da112707SJunchao Zhang Acsr = (CsrMatrix*)Acusp->mat->mat; 1775*da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 1776*da112707SJunchao Zhang 1777*da112707SJunchao Zhang /* Factorize fact inplace */ 1778*da112707SJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1779*da112707SJunchao Zhang fs->matDescr_M, 1780*da112707SJunchao Zhang fs->csrVal, 1781*da112707SJunchao Zhang fs->csrRowPtr, 1782*da112707SJunchao Zhang fs->csrColIdx, 1783*da112707SJunchao Zhang fs->ilu0Info_M, 1784*da112707SJunchao Zhang fs->policy_M, 1785*da112707SJunchao Zhang fs->factBuffer_M)); 1786*da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1787*da112707SJunchao Zhang int numerical_zero; 1788*da112707SJunchao Zhang cusparseStatus_t status; 1789*da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1790*da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero); 1791*da112707SJunchao Zhang } 1792*da112707SJunchao Zhang 1793*da112707SJunchao Zhang /* From my experiment, cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() */ 1794*da112707SJunchao Zhang 
PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1795*da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1796*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1797*da112707SJunchao Zhang fs->spMatDescr_L, 1798*da112707SJunchao Zhang fs->dnVecDescr_X, 1799*da112707SJunchao Zhang fs->dnVecDescr_Y, 1800*da112707SJunchao Zhang cusparse_scalartype, 1801*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1802*da112707SJunchao Zhang fs->spsvDescr_L, 1803*da112707SJunchao Zhang fs->spsvBuffer_L)); 1804*da112707SJunchao Zhang 1805*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 1806*da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1807*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1808*da112707SJunchao Zhang fs->spMatDescr_U, 1809*da112707SJunchao Zhang fs->dnVecDescr_X, 1810*da112707SJunchao Zhang fs->dnVecDescr_Y, 1811*da112707SJunchao Zhang cusparse_scalartype, 1812*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1813*da112707SJunchao Zhang fs->spsvDescr_U, 1814*da112707SJunchao Zhang fs->spsvBuffer_U)); 1815*da112707SJunchao Zhang 1816*da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 1817*da112707SJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ILU0; 1818*da112707SJunchao Zhang fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_ILU0; 1819*da112707SJunchao Zhang fact->ops->matsolve = NULL; 1820*da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 1821*da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1822*da112707SJunchao Zhang PetscFunctionReturn(0); 1823*da112707SJunchao Zhang } 1824*da112707SJunchao Zhang 1825*da112707SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1826*da112707SJunchao Zhang { 1827*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 1828*da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 
1829*da112707SJunchao Zhang PetscInt m,nz; 1830*da112707SJunchao Zhang 1831*da112707SJunchao Zhang PetscFunctionBegin; 1832*da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1833*da112707SJunchao Zhang PetscInt i; 1834*da112707SJunchao Zhang PetscBool flg,missing; 1835*da112707SJunchao Zhang 1836*da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 1837*da112707SJunchao Zhang PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 1838*da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n); 1839*da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A,&missing,&i)); 1840*da112707SJunchao Zhang PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i); 1841*da112707SJunchao Zhang } 1842*da112707SJunchao Zhang 1843*da112707SJunchao Zhang /* Free the old stale stuff */ 1844*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1845*da112707SJunchao Zhang 1846*da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1847*da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 
1848*da112707SJunchao Zhang */ 1849*da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/)); 1850*da112707SJunchao Zhang 1851*da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 1852*da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ILU; 1853*da112707SJunchao Zhang fact->info.factor_mallocs = 0; 1854*da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 1855*da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 1856*da112707SJunchao Zhang 1857*da112707SJunchao Zhang aij->row = NULL; 1858*da112707SJunchao Zhang aij->col = NULL; 1859*da112707SJunchao Zhang 1860*da112707SJunchao Zhang /* ====================================================================== */ 1861*da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. */ 1862*da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 1863*da112707SJunchao Zhang /* ====================================================================== */ 1864*da112707SJunchao Zhang const int *Ai,*Aj; 1865*da112707SJunchao Zhang 1866*da112707SJunchao Zhang m = fact->rmap->n; 1867*da112707SJunchao Zhang nz = aij->nz; 1868*da112707SJunchao Zhang 1869*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1))); 1870*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz)); 1871*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz)); 1872*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */ 1873*da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 1874*da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 1875*da112707SJunchao Zhang 1876*da112707SJunchao Zhang /* 
====================================================================== */ 1877*da112707SJunchao Zhang /* Create descriptors for M, L, U */ 1878*da112707SJunchao Zhang /* ====================================================================== */ 1879*da112707SJunchao Zhang cusparseFillMode_t fillMode; 1880*da112707SJunchao Zhang cusparseDiagType_t diagType; 1881*da112707SJunchao Zhang 1882*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1883*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1884*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1885*da112707SJunchao Zhang 1886*da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1887*da112707SJunchao Zhang cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1888*da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1889*da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1890*da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1891*da112707SJunchao Zhang */ 1892*da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 1893*da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_UNIT; 1894*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz, 1895*da112707SJunchao Zhang fs->csrRowPtr, 1896*da112707SJunchao Zhang fs->csrColIdx, 1897*da112707SJunchao Zhang fs->csrVal, 1898*da112707SJunchao Zhang CUSPARSE_INDEX_32I, 1899*da112707SJunchao Zhang CUSPARSE_INDEX_32I, 1900*da112707SJunchao Zhang CUSPARSE_INDEX_BASE_ZERO, 1901*da112707SJunchao Zhang cusparse_scalartype)); 1902*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 1903*da112707SJunchao Zhang CUSPARSE_SPMAT_FILL_MODE, 1904*da112707SJunchao Zhang &fillMode, 1905*da112707SJunchao Zhang sizeof(fillMode))); 1906*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 1907*da112707SJunchao Zhang CUSPARSE_SPMAT_DIAG_TYPE, 1908*da112707SJunchao Zhang &diagType, 1909*da112707SJunchao Zhang sizeof(diagType))); 1910*da112707SJunchao Zhang 1911*da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_UPPER; 1912*da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1913*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz, 1914*da112707SJunchao Zhang fs->csrRowPtr, 1915*da112707SJunchao Zhang fs->csrColIdx, 1916*da112707SJunchao Zhang fs->csrVal, 1917*da112707SJunchao Zhang CUSPARSE_INDEX_32I, 1918*da112707SJunchao Zhang CUSPARSE_INDEX_32I, 1919*da112707SJunchao Zhang CUSPARSE_INDEX_BASE_ZERO, 1920*da112707SJunchao Zhang cusparse_scalartype)); 1921*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, 1922*da112707SJunchao Zhang CUSPARSE_SPMAT_FILL_MODE, 1923*da112707SJunchao Zhang &fillMode, 1924*da112707SJunchao Zhang sizeof(fillMode))); 1925*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, 1926*da112707SJunchao Zhang CUSPARSE_SPMAT_DIAG_TYPE, 
1927*da112707SJunchao Zhang &diagType, 1928*da112707SJunchao Zhang sizeof(diagType))); 1929*da112707SJunchao Zhang 1930*da112707SJunchao Zhang /* ========================================================================= */ 1931*da112707SJunchao Zhang /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1932*da112707SJunchao Zhang /* ========================================================================= */ 1933*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1934*da112707SJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1935*da112707SJunchao Zhang fs->matDescr_M, 1936*da112707SJunchao Zhang fs->csrVal, 1937*da112707SJunchao Zhang fs->csrRowPtr, 1938*da112707SJunchao Zhang fs->csrColIdx, 1939*da112707SJunchao Zhang fs->ilu0Info_M, 1940*da112707SJunchao Zhang &fs->factBufferSize_M)); 1941*da112707SJunchao Zhang 1942*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m)); 1943*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m)); 1944*da112707SJunchao Zhang 1945*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype)); 1946*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype)); 1947*da112707SJunchao Zhang 1948*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1949*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1950*da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1951*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1952*da112707SJunchao Zhang fs->spMatDescr_L, 1953*da112707SJunchao Zhang fs->dnVecDescr_X, 1954*da112707SJunchao Zhang fs->dnVecDescr_Y, 1955*da112707SJunchao Zhang cusparse_scalartype, 1956*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1957*da112707SJunchao Zhang 
fs->spsvDescr_L, 1958*da112707SJunchao Zhang &fs->spsvBufferSize_L)); 1959*da112707SJunchao Zhang 1960*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1961*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 1962*da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 1963*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 1964*da112707SJunchao Zhang fs->spMatDescr_U, 1965*da112707SJunchao Zhang fs->dnVecDescr_X, 1966*da112707SJunchao Zhang fs->dnVecDescr_Y, 1967*da112707SJunchao Zhang cusparse_scalartype, 1968*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 1969*da112707SJunchao Zhang fs->spsvDescr_U, 1970*da112707SJunchao Zhang &fs->spsvBufferSize_U)); 1971*da112707SJunchao Zhang 1972*da112707SJunchao Zhang /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1973*da112707SJunchao Zhang spsvBuffer_L and spsvBuffer_U can not be shared. 1974*da112707SJunchao Zhang */ 1975*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U)); 1976*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L)); 1977*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,fs->factBufferSize_M)); 1978*da112707SJunchao Zhang 1979*da112707SJunchao Zhang /* ========================================================================== */ 1980*da112707SJunchao Zhang /* Perform analysis of ilu0 on M, SpSv on L and U */ 1981*da112707SJunchao Zhang /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1982*da112707SJunchao Zhang /* ========================================================================== */ 1983*da112707SJunchao Zhang int structural_zero; 1984*da112707SJunchao Zhang cusparseStatus_t status; 1985*da112707SJunchao Zhang 1986*da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1987*da112707SJunchao Zhang if (m) 
PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1988*da112707SJunchao Zhang fs->matDescr_M, 1989*da112707SJunchao Zhang fs->csrVal, 1990*da112707SJunchao Zhang fs->csrRowPtr, 1991*da112707SJunchao Zhang fs->csrColIdx, 1992*da112707SJunchao Zhang fs->ilu0Info_M, 1993*da112707SJunchao Zhang fs->policy_M, 1994*da112707SJunchao Zhang fs->factBuffer_M)); 1995*da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1996*da112707SJunchao Zhang /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 1997*da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1998*da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero); 1999*da112707SJunchao Zhang } 2000*da112707SJunchao Zhang 2001*da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 2002*da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ*)A->data; 2003*da112707SJunchao Zhang PetscInt *Adiag,nzRow,nzLeft; 2004*da112707SJunchao Zhang PetscLogDouble flops = 0.0; 2005*da112707SJunchao Zhang 2006*da112707SJunchao Zhang PetscCall(MatMarkDiagonal_SeqAIJ(A)); 2007*da112707SJunchao Zhang Ai = Aseq->i; 2008*da112707SJunchao Zhang Adiag = Aseq->diag; 2009*da112707SJunchao Zhang for (PetscInt i=0; i<m; i++) { 2010*da112707SJunchao Zhang if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i+1]) { /* There are nonzeros left to the diagonal of row i */ 2011*da112707SJunchao Zhang nzRow = Ai[i+1] - Ai[i]; 2012*da112707SJunchao Zhang nzLeft = Adiag[i] - Ai[i]; 2013*da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. 
Assume each time, nonzeros right 2014*da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 2015*da112707SJunchao Zhang */ 2016*da112707SJunchao Zhang nzLeft = (nzRow-1)/2; 2017*da112707SJunchao Zhang flops += nzLeft*(2.0*nzRow-nzLeft+1); 2018*da112707SJunchao Zhang } 2019*da112707SJunchao Zhang } 2020*da112707SJunchao Zhang fs->numericFactFlops = flops; 2021*da112707SJunchao Zhang fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 2022*da112707SJunchao Zhang PetscFunctionReturn(0); 2023*da112707SJunchao Zhang } 2024*da112707SJunchao Zhang 2025*da112707SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x) 2026*da112707SJunchao Zhang { 2027*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 2028*da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 2029*da112707SJunchao Zhang const PetscScalar *barray; 2030*da112707SJunchao Zhang PetscScalar *xarray; 2031*da112707SJunchao Zhang 2032*da112707SJunchao Zhang PetscFunctionBegin; 2033*da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x,&xarray)); 2034*da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b,&barray)); 2035*da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 2036*da112707SJunchao Zhang 2037*da112707SJunchao Zhang /* Solve L*y = b */ 2038*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)barray)); 2039*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y)); 2040*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 2041*da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 2042*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2043*da112707SJunchao Zhang fs->spMatDescr_L, /* L Y = X */ 2044*da112707SJunchao Zhang fs->dnVecDescr_X, 2045*da112707SJunchao Zhang fs->dnVecDescr_Y, 2046*da112707SJunchao Zhang cusparse_scalartype, 
2047*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2048*da112707SJunchao Zhang fs->spsvDescr_L)); 2049*da112707SJunchao Zhang 2050*da112707SJunchao Zhang /* Solve Lt*x = y */ 2051*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,xarray)); 2052*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, 2053*da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 2054*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2055*da112707SJunchao Zhang fs->spMatDescr_L, /* Lt X = Y */ 2056*da112707SJunchao Zhang fs->dnVecDescr_Y, 2057*da112707SJunchao Zhang fs->dnVecDescr_X, 2058*da112707SJunchao Zhang cusparse_scalartype, 2059*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2060*da112707SJunchao Zhang fs->spsvDescr_Lt)); 2061*da112707SJunchao Zhang 2062*da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b,&barray)); 2063*da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x,&xarray)); 2064*da112707SJunchao Zhang 2065*da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 2066*da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n)); 2067*da112707SJunchao Zhang PetscFunctionReturn(0); 2068*da112707SJunchao Zhang } 2069*da112707SJunchao Zhang 2070*da112707SJunchao Zhang static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info) 2071*da112707SJunchao Zhang { 2072*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 2073*da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 2074*da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2075*da112707SJunchao Zhang CsrMatrix *Acsr; 2076*da112707SJunchao Zhang PetscInt m,nz; 2077*da112707SJunchao Zhang PetscBool flg; 2078*da112707SJunchao Zhang 2079*da112707SJunchao Zhang PetscFunctionBegin; 2080*da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2081*da112707SJunchao Zhang 
PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2082*da112707SJunchao Zhang PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 2083*da112707SJunchao Zhang } 2084*da112707SJunchao Zhang 2085*da112707SJunchao Zhang /* Copy A's value to fact */ 2086*da112707SJunchao Zhang m = fact->rmap->n; 2087*da112707SJunchao Zhang nz = aij->nz; 2088*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2089*da112707SJunchao Zhang Acsr = (CsrMatrix*)Acusp->mat->mat; 2090*da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2091*da112707SJunchao Zhang 2092*da112707SJunchao Zhang /* Factorize fact inplace */ 2093*da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 2094*da112707SJunchao Zhang Function csric02() only takes the lower triangular part of matrix A to perform factorization. 2095*da112707SJunchao Zhang The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 2096*da112707SJunchao Zhang and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 2097*da112707SJunchao Zhang In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 
2098*da112707SJunchao Zhang */ 2099*da112707SJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, 2100*da112707SJunchao Zhang fs->matDescr_M, 2101*da112707SJunchao Zhang fs->csrVal, 2102*da112707SJunchao Zhang fs->csrRowPtr, 2103*da112707SJunchao Zhang fs->csrColIdx, 2104*da112707SJunchao Zhang fs->ic0Info_M, 2105*da112707SJunchao Zhang fs->policy_M, 2106*da112707SJunchao Zhang fs->factBuffer_M)); 2107*da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2108*da112707SJunchao Zhang int numerical_zero; 2109*da112707SJunchao Zhang cusparseStatus_t status; 2110*da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 2111*da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",numerical_zero,numerical_zero); 2112*da112707SJunchao Zhang } 2113*da112707SJunchao Zhang 2114*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 2115*da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 2116*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2117*da112707SJunchao Zhang fs->spMatDescr_L, 2118*da112707SJunchao Zhang fs->dnVecDescr_X, 2119*da112707SJunchao Zhang fs->dnVecDescr_Y, 2120*da112707SJunchao Zhang cusparse_scalartype, 2121*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2122*da112707SJunchao Zhang fs->spsvDescr_L, 2123*da112707SJunchao Zhang fs->spsvBuffer_L)); 2124*da112707SJunchao Zhang 2125*da112707SJunchao Zhang /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 2126*da112707SJunchao Zhang ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 2127*da112707SJunchao Zhang */ 2128*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, 2129*da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 2130*da112707SJunchao Zhang 
&PETSC_CUSPARSE_ONE, 2131*da112707SJunchao Zhang fs->spMatDescr_L, 2132*da112707SJunchao Zhang fs->dnVecDescr_X, 2133*da112707SJunchao Zhang fs->dnVecDescr_Y, 2134*da112707SJunchao Zhang cusparse_scalartype, 2135*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2136*da112707SJunchao Zhang fs->spsvDescr_Lt, 2137*da112707SJunchao Zhang fs->spsvBuffer_Lt)); 2138*da112707SJunchao Zhang 2139*da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 2140*da112707SJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 2141*da112707SJunchao Zhang fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 2142*da112707SJunchao Zhang fact->ops->matsolve = NULL; 2143*da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 2144*da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 2145*da112707SJunchao Zhang PetscFunctionReturn(0); 2146*da112707SJunchao Zhang } 2147*da112707SJunchao Zhang 2148*da112707SJunchao Zhang static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2149*da112707SJunchao Zhang { 2150*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr; 2151*da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ*)fact->data; 2152*da112707SJunchao Zhang PetscInt m,nz; 2153*da112707SJunchao Zhang 2154*da112707SJunchao Zhang PetscFunctionBegin; 2155*da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2156*da112707SJunchao Zhang PetscInt i; 2157*da112707SJunchao Zhang PetscBool flg,missing; 2158*da112707SJunchao Zhang 2159*da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 2160*da112707SJunchao Zhang PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name); 2161*da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" 
PetscInt_FMT,A->rmap->n,A->cmap->n); 2162*da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A,&missing,&i)); 2163*da112707SJunchao Zhang PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i); 2164*da112707SJunchao Zhang } 2165*da112707SJunchao Zhang 2166*da112707SJunchao Zhang /* Free the old stale stuff */ 2167*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2168*da112707SJunchao Zhang 2169*da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2170*da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 2171*da112707SJunchao Zhang */ 2172*da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/)); 2173*da112707SJunchao Zhang 2174*da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 2175*da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ICC; 2176*da112707SJunchao Zhang fact->info.factor_mallocs = 0; 2177*da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 2178*da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 2179*da112707SJunchao Zhang 2180*da112707SJunchao Zhang aij->row = NULL; 2181*da112707SJunchao Zhang aij->col = NULL; 2182*da112707SJunchao Zhang 2183*da112707SJunchao Zhang /* ====================================================================== */ 2184*da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 2185*da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 2186*da112707SJunchao Zhang /* ====================================================================== */ 2187*da112707SJunchao Zhang const int *Ai,*Aj; 2188*da112707SJunchao Zhang 2189*da112707SJunchao Zhang m = fact->rmap->n; 2190*da112707SJunchao Zhang nz = aij->nz; 2191*da112707SJunchao Zhang 2192*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1))); 2193*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz)); 2194*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz)); 2195*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj)); /* Do not use compressed Ai */ 2196*da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2197*da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream)); 2198*da112707SJunchao Zhang 2199*da112707SJunchao Zhang /* ====================================================================== */ 2200*da112707SJunchao Zhang /* Create mat descriptors for M, L */ 2201*da112707SJunchao Zhang /* ====================================================================== */ 2202*da112707SJunchao Zhang cusparseFillMode_t fillMode; 2203*da112707SJunchao Zhang cusparseDiagType_t diagType; 2204*da112707SJunchao Zhang 2205*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2206*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2207*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2208*da112707SJunchao Zhang 2209*da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2210*da112707SJunchao Zhang 
cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2211*da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2212*da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2213*da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 2214*da112707SJunchao Zhang */ 2215*da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 2216*da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2217*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz, 2218*da112707SJunchao Zhang fs->csrRowPtr, 2219*da112707SJunchao Zhang fs->csrColIdx, 2220*da112707SJunchao Zhang fs->csrVal, 2221*da112707SJunchao Zhang CUSPARSE_INDEX_32I, 2222*da112707SJunchao Zhang CUSPARSE_INDEX_32I, 2223*da112707SJunchao Zhang CUSPARSE_INDEX_BASE_ZERO, 2224*da112707SJunchao Zhang cusparse_scalartype)); 2225*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 2226*da112707SJunchao Zhang CUSPARSE_SPMAT_FILL_MODE, 2227*da112707SJunchao Zhang &fillMode, 2228*da112707SJunchao Zhang sizeof(fillMode))); 2229*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, 2230*da112707SJunchao Zhang CUSPARSE_SPMAT_DIAG_TYPE, 2231*da112707SJunchao Zhang &diagType, 2232*da112707SJunchao Zhang sizeof(diagType))); 2233*da112707SJunchao Zhang 2234*da112707SJunchao Zhang /* ========================================================================= */ 2235*da112707SJunchao Zhang /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2236*da112707SJunchao Zhang /* ========================================================================= */ 2237*da112707SJunchao Zhang 
PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2238*da112707SJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, 2239*da112707SJunchao Zhang fs->matDescr_M, 2240*da112707SJunchao Zhang fs->csrVal, 2241*da112707SJunchao Zhang fs->csrRowPtr, 2242*da112707SJunchao Zhang fs->csrColIdx, 2243*da112707SJunchao Zhang fs->ic0Info_M, 2244*da112707SJunchao Zhang &fs->factBufferSize_M)); 2245*da112707SJunchao Zhang 2246*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m)); 2247*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m)); 2248*da112707SJunchao Zhang 2249*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype)); 2250*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype)); 2251*da112707SJunchao Zhang 2252*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 2253*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 2254*da112707SJunchao Zhang CUSPARSE_OPERATION_NON_TRANSPOSE, 2255*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2256*da112707SJunchao Zhang fs->spMatDescr_L, 2257*da112707SJunchao Zhang fs->dnVecDescr_X, 2258*da112707SJunchao Zhang fs->dnVecDescr_Y, 2259*da112707SJunchao Zhang cusparse_scalartype, 2260*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2261*da112707SJunchao Zhang fs->spsvDescr_L, 2262*da112707SJunchao Zhang &fs->spsvBufferSize_L)); 2263*da112707SJunchao Zhang 2264*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 2265*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 2266*da112707SJunchao Zhang CUSPARSE_OPERATION_TRANSPOSE, 2267*da112707SJunchao Zhang &PETSC_CUSPARSE_ONE, 2268*da112707SJunchao Zhang fs->spMatDescr_L, 2269*da112707SJunchao Zhang fs->dnVecDescr_X, 2270*da112707SJunchao Zhang fs->dnVecDescr_Y, 
2271*da112707SJunchao Zhang cusparse_scalartype, 2272*da112707SJunchao Zhang CUSPARSE_SPSV_ALG_DEFAULT, 2273*da112707SJunchao Zhang fs->spsvDescr_Lt, 2274*da112707SJunchao Zhang &fs->spsvBufferSize_Lt)); 2275*da112707SJunchao Zhang 2276*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,fs->factBufferSize_M)); 2277*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L)); 2278*da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt)); 2279*da112707SJunchao Zhang 2280*da112707SJunchao Zhang /* ========================================================================== */ 2281*da112707SJunchao Zhang /* Perform analysis of ic0 on M */ 2282*da112707SJunchao Zhang /* The lower triangular part of M has the same sparsity pattern as L */ 2283*da112707SJunchao Zhang /* ========================================================================== */ 2284*da112707SJunchao Zhang int structural_zero; 2285*da112707SJunchao Zhang cusparseStatus_t status; 2286*da112707SJunchao Zhang 2287*da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2288*da112707SJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, 2289*da112707SJunchao Zhang fs->matDescr_M, 2290*da112707SJunchao Zhang fs->csrVal, 2291*da112707SJunchao Zhang fs->csrRowPtr, 2292*da112707SJunchao Zhang fs->csrColIdx, 2293*da112707SJunchao Zhang fs->ic0Info_M, 2294*da112707SJunchao Zhang fs->policy_M, 2295*da112707SJunchao Zhang fs->factBuffer_M)); 2296*da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2297*da112707SJunchao Zhang /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 2298*da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2299*da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero); 2300*da112707SJunchao Zhang } 2301*da112707SJunchao Zhang 2302*da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 2303*da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ*)A->data; 2304*da112707SJunchao Zhang PetscInt nzRow,nzLeft; 2305*da112707SJunchao Zhang PetscLogDouble flops = 0.0; 2306*da112707SJunchao Zhang 2307*da112707SJunchao Zhang Ai = Aseq->i; 2308*da112707SJunchao Zhang for (PetscInt i=0; i<m; i++) { 2309*da112707SJunchao Zhang nzRow = Ai[i+1] - Ai[i]; 2310*da112707SJunchao Zhang if (nzRow > 1) { 2311*da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2312*da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 
2313*da112707SJunchao Zhang */ 2314*da112707SJunchao Zhang nzLeft = (nzRow-1)/2; 2315*da112707SJunchao Zhang flops += nzLeft*(2.0*nzRow-nzLeft+1); 2316*da112707SJunchao Zhang } 2317*da112707SJunchao Zhang } 2318*da112707SJunchao Zhang fs->numericFactFlops = flops; 2319*da112707SJunchao Zhang fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 2320*da112707SJunchao Zhang PetscFunctionReturn(0); 2321*da112707SJunchao Zhang } 2322*da112707SJunchao Zhang #endif 2323*da112707SJunchao Zhang 2324*da112707SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2325*da112707SJunchao Zhang { 2326*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2327*da112707SJunchao Zhang 2328*da112707SJunchao Zhang PetscFunctionBegin; 2329*da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 2330*da112707SJunchao Zhang PetscBool row_identity,col_identity; 2331*da112707SJunchao Zhang PetscCall(ISIdentity(isrow,&row_identity)); 2332*da112707SJunchao Zhang PetscCall(ISIdentity(iscol,&col_identity)); 2333*da112707SJunchao Zhang if (!info->levels && row_identity && col_identity) { 2334*da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info)); 2335*da112707SJunchao Zhang } else 2336*da112707SJunchao Zhang #endif 2337*da112707SJunchao Zhang { 2338*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2339*da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2340*da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2341*da112707SJunchao Zhang } 2342*da112707SJunchao Zhang PetscFunctionReturn(0); 2343*da112707SJunchao Zhang } 2344*da112707SJunchao Zhang 2345*da112707SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 
2346*da112707SJunchao Zhang { 2347*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2348*da112707SJunchao Zhang 2349*da112707SJunchao Zhang PetscFunctionBegin; 2350*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2351*da112707SJunchao Zhang PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 2352*da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2353*da112707SJunchao Zhang PetscFunctionReturn(0); 2354*da112707SJunchao Zhang } 2355*da112707SJunchao Zhang 2356*da112707SJunchao Zhang static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2357*da112707SJunchao Zhang { 2358*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2359*da112707SJunchao Zhang 2360*da112707SJunchao Zhang PetscFunctionBegin; 2361*da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 2362*da112707SJunchao Zhang PetscBool perm_identity; 2363*da112707SJunchao Zhang PetscCall(ISIdentity(perm,&perm_identity)); 2364*da112707SJunchao Zhang if (!info->levels && perm_identity) { 2365*da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info)); 2366*da112707SJunchao Zhang } else 2367*da112707SJunchao Zhang #endif 2368*da112707SJunchao Zhang { 2369*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2370*da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info)); 2371*da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2372*da112707SJunchao Zhang } 2373*da112707SJunchao Zhang PetscFunctionReturn(0); 2374*da112707SJunchao Zhang } 2375*da112707SJunchao Zhang 2376*da112707SJunchao Zhang static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2377*da112707SJunchao Zhang { 
2378*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 2379*da112707SJunchao Zhang 2380*da112707SJunchao Zhang PetscFunctionBegin; 2381*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2382*da112707SJunchao Zhang PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info)); 2383*da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2384*da112707SJunchao Zhang PetscFunctionReturn(0); 2385*da112707SJunchao Zhang } 2386*da112707SJunchao Zhang 2387841d4cb1SJunchao Zhang PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 2388841d4cb1SJunchao Zhang { 2389841d4cb1SJunchao Zhang PetscFunctionBegin; 2390841d4cb1SJunchao Zhang *type = MATSOLVERCUSPARSE; 2391841d4cb1SJunchao Zhang PetscFunctionReturn(0); 2392841d4cb1SJunchao Zhang } 2393841d4cb1SJunchao Zhang 2394841d4cb1SJunchao Zhang /*MC 2395841d4cb1SJunchao Zhang MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 2396841d4cb1SJunchao Zhang on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 2397841d4cb1SJunchao Zhang algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2398841d4cb1SJunchao Zhang performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 2399841d4cb1SJunchao Zhang CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2400841d4cb1SJunchao Zhang algorithms are not recommended. This class does NOT support direct solver operations. 
2401841d4cb1SJunchao Zhang 2402841d4cb1SJunchao Zhang Level: beginner 2403841d4cb1SJunchao Zhang 2404841d4cb1SJunchao Zhang .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2405841d4cb1SJunchao Zhang M*/ 2406841d4cb1SJunchao Zhang 2407841d4cb1SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 2408841d4cb1SJunchao Zhang { 2409841d4cb1SJunchao Zhang PetscInt n = A->rmap->n; 2410841d4cb1SJunchao Zhang 2411841d4cb1SJunchao Zhang PetscFunctionBegin; 2412841d4cb1SJunchao Zhang PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 2413841d4cb1SJunchao Zhang PetscCall(MatSetSizes(*B,n,n,n,n)); 2414841d4cb1SJunchao Zhang (*B)->factortype = ftype; 2415841d4cb1SJunchao Zhang PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE)); 2416841d4cb1SJunchao Zhang 2417841d4cb1SJunchao Zhang if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE)); 2418841d4cb1SJunchao Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2419841d4cb1SJunchao Zhang PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 2420841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2421841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2422841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2423841d4cb1SJunchao Zhang } else { 2424841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2425841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2426841d4cb1SJunchao Zhang } 2427841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 2428841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 
2429841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2430841d4cb1SJunchao Zhang } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2431841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2432841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2433841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2434841d4cb1SJunchao Zhang } else { 2435841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2436841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2437841d4cb1SJunchao Zhang } 2438841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2439841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2440841d4cb1SJunchao Zhang } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 2441841d4cb1SJunchao Zhang 2442841d4cb1SJunchao Zhang PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 2443841d4cb1SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 2444841d4cb1SJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse)); 2445841d4cb1SJunchao Zhang PetscFunctionReturn(0); 2446841d4cb1SJunchao Zhang } 2447841d4cb1SJunchao Zhang 24487e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 24497e8381f9SStefano Zampini { 24507e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 24517e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2452*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 24537e8381f9SStefano Zampini 24547e8381f9SStefano Zampini PetscFunctionBegin; 
24557e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 24569566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 2457*da112707SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 2458*da112707SJunchao Zhang CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 24599566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2460*da112707SJunchao Zhang } 2461*da112707SJunchao Zhang #if CUSPARSE_VERSION >= 13500 2462*da112707SJunchao Zhang else if (fs->csrVal) { 2463*da112707SJunchao Zhang /* We have a factorized matrix on device and are able to copy it to host */ 2464*da112707SJunchao Zhang PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2465*da112707SJunchao Zhang } 2466*da112707SJunchao Zhang #endif 2467*da112707SJunchao Zhang else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host"); 24689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(a->nz*sizeof(PetscScalar))); 24699566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0)); 24707e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 24717e8381f9SStefano Zampini } 24727e8381f9SStefano Zampini PetscFunctionReturn(0); 24737e8381f9SStefano Zampini } 24747e8381f9SStefano Zampini 24757e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 24767e8381f9SStefano Zampini { 24777e8381f9SStefano Zampini PetscFunctionBegin; 24789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 247967a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 248067a45760SJunchao Zhang PetscFunctionReturn(0); 248167a45760SJunchao Zhang } 248267a45760SJunchao Zhang 248367a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 
248467a45760SJunchao Zhang { 248567a45760SJunchao Zhang PetscFunctionBegin; 24867e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 248767a45760SJunchao Zhang *array = NULL; 248867a45760SJunchao Zhang PetscFunctionReturn(0); 248967a45760SJunchao Zhang } 249067a45760SJunchao Zhang 249167a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 249267a45760SJunchao Zhang { 249367a45760SJunchao Zhang PetscFunctionBegin; 24949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 249567a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 249667a45760SJunchao Zhang PetscFunctionReturn(0); 249767a45760SJunchao Zhang } 249867a45760SJunchao Zhang 249967a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[]) 250067a45760SJunchao Zhang { 250167a45760SJunchao Zhang PetscFunctionBegin; 250267a45760SJunchao Zhang *array = NULL; 250367a45760SJunchao Zhang PetscFunctionReturn(0); 250467a45760SJunchao Zhang } 250567a45760SJunchao Zhang 250667a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 250767a45760SJunchao Zhang { 250867a45760SJunchao Zhang PetscFunctionBegin; 250967a45760SJunchao Zhang *array = ((Mat_SeqAIJ*)A->data)->a; 251067a45760SJunchao Zhang PetscFunctionReturn(0); 251167a45760SJunchao Zhang } 251267a45760SJunchao Zhang 251367a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 251467a45760SJunchao Zhang { 251567a45760SJunchao Zhang PetscFunctionBegin; 251667a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 251767a45760SJunchao Zhang *array = NULL; 25187e8381f9SStefano Zampini PetscFunctionReturn(0); 25197e8381f9SStefano Zampini } 25207e8381f9SStefano Zampini 25217ee59b9bSJunchao Zhang static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt 
**j,PetscScalar **a,PetscMemType *mtype) 25227ee59b9bSJunchao Zhang { 25237ee59b9bSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp; 25247ee59b9bSJunchao Zhang CsrMatrix *matrix; 25257ee59b9bSJunchao Zhang 25267ee59b9bSJunchao Zhang PetscFunctionBegin; 25277ee59b9bSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 25287ee59b9bSJunchao Zhang PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 25297ee59b9bSJunchao Zhang cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr); 25307ee59b9bSJunchao Zhang PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL"); 25317ee59b9bSJunchao Zhang matrix = (CsrMatrix*)cusp->mat->mat; 25327ee59b9bSJunchao Zhang 25337ee59b9bSJunchao Zhang if (i) { 25347ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 25357ee59b9bSJunchao Zhang *i = matrix->row_offsets->data().get(); 25367ee59b9bSJunchao Zhang #else 25377ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 25387ee59b9bSJunchao Zhang #endif 25397ee59b9bSJunchao Zhang } 25407ee59b9bSJunchao Zhang if (j) { 25417ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 25427ee59b9bSJunchao Zhang *j = matrix->column_indices->data().get(); 25437ee59b9bSJunchao Zhang #else 25447ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices"); 25457ee59b9bSJunchao Zhang #endif 25467ee59b9bSJunchao Zhang } 25477ee59b9bSJunchao Zhang if (a) *a = matrix->values->data().get(); 25487ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 25497ee59b9bSJunchao Zhang PetscFunctionReturn(0); 25507ee59b9bSJunchao Zhang } 25517ee59b9bSJunchao Zhang 2552042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 25539ae82921SPaul Mullowney { 2554aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 25557c700b8dSJunchao 
Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 25569ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2557213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 2558aa372e3fSPaul Mullowney cusparseStatus_t stat; 2559abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 25609ae82921SPaul Mullowney 25619ae82921SPaul Mullowney PetscFunctionBegin; 256228b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU"); 2563c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2564a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2565a49f1ed0SStefano Zampini CsrMatrix *matrix; 2566afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 256785ba7357SStefano Zampini 256808401ef6SPierre Jolivet PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values"); 25699566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 2570afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 25719566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 25729566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar))); 25739566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 25749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 257534d6c7a5SJose E. 
Roman } else { 2576abb89eb1SStefano Zampini PetscInt nnz; 25779566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0)); 25789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format)); 25799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 25807c700b8dSJunchao Zhang delete cusparsestruct->workVector; 258181902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 2582a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 2583a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 25849ae82921SPaul Mullowney try { 25859ae82921SPaul Mullowney if (a->compressedrow.use) { 25869ae82921SPaul Mullowney m = a->compressedrow.nrows; 25879ae82921SPaul Mullowney ii = a->compressedrow.i; 25889ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 25899ae82921SPaul Mullowney } else { 2590213423ffSJunchao Zhang m = A->rmap->n; 2591213423ffSJunchao Zhang ii = a->i; 2592e6e9a74fSStefano Zampini ridx = NULL; 25939ae82921SPaul Mullowney } 259408401ef6SPierre Jolivet PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data"); 2595abb89eb1SStefano Zampini if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; } 2596abb89eb1SStefano Zampini else nnz = a->nz; 259708401ef6SPierre Jolivet PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data"); 25989ae82921SPaul Mullowney 259985ba7357SStefano Zampini /* create cusparse matrix */ 2600abb89eb1SStefano Zampini cusparsestruct->nrows = m; 2601aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 26029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 26039566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 26049566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 26059ae82921SPaul 
Mullowney 26069566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar))); 26079566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar))); 26089566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 26099566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 26109566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 26119566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 26129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2613b06137fdSPaul Mullowney 2614aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2615aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 2616aa372e3fSPaul Mullowney /* set the matrix */ 2617afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 2618afb2bd1cSJunchao Zhang mat->num_rows = m; 2619afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2620abb89eb1SStefano Zampini mat->num_entries = nnz; 2621afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 2622afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 26239ae82921SPaul Mullowney 2624abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 2625abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 2626aa372e3fSPaul Mullowney 2627abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 2628abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 2629aa372e3fSPaul Mullowney 2630aa372e3fSPaul Mullowney /* assign the pointer */ 2631afb2bd1cSJunchao Zhang 
matstruct->mat = mat; 2632afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2633afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 2634afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 2635afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 2636afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 2637afb2bd1cSJunchao Zhang mat->values->data().get(), 2638afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 26399566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2640afb2bd1cSJunchao Zhang } 2641afb2bd1cSJunchao Zhang #endif 2642aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 2643afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2644afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2645afb2bd1cSJunchao Zhang #else 2646afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 2647afb2bd1cSJunchao Zhang mat->num_rows = m; 2648afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2649abb89eb1SStefano Zampini mat->num_entries = nnz; 2650afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 2651afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 2652aa372e3fSPaul Mullowney 2653abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 2654abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j+nnz); 2655aa372e3fSPaul Mullowney 2656abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 2657abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a+nnz); 2658aa372e3fSPaul Mullowney 2659aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 26609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 
2661aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 2662aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2663afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 2664afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 2665afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 2666afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 26679566063dSJacob Faibussowitsch hybMat, 0, partition);PetscCallCUSPARSE(stat); 2668aa372e3fSPaul Mullowney /* assign the pointer */ 2669aa372e3fSPaul Mullowney matstruct->mat = hybMat; 2670aa372e3fSPaul Mullowney 2671afb2bd1cSJunchao Zhang if (mat) { 2672afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 2673afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 2674afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 2675afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 2676087f3262SPaul Mullowney } 2677afb2bd1cSJunchao Zhang #endif 2678087f3262SPaul Mullowney } 2679ca45077fSPaul Mullowney 2680aa372e3fSPaul Mullowney /* assign the compressed row indices */ 2681213423ffSJunchao Zhang if (a->compressedrow.use) { 2682213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 2683aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 2684aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 2685213423ffSJunchao Zhang tmp = m; 2686213423ffSJunchao Zhang } else { 2687213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 2688213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 2689213423ffSJunchao Zhang tmp = 0; 2690213423ffSJunchao Zhang } 26919566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar))); 2692aa372e3fSPaul Mullowney 2693aa372e3fSPaul Mullowney /* 
assign the pointer */ 2694aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 26959ae82921SPaul Mullowney } catch(char *ex) { 269698921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 26979ae82921SPaul Mullowney } 26989566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 26999566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0)); 270034d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 270134d6c7a5SJose E. Roman } 2702abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 27039ae82921SPaul Mullowney } 27049ae82921SPaul Mullowney PetscFunctionReturn(0); 27059ae82921SPaul Mullowney } 27069ae82921SPaul Mullowney 2707c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 2708aa372e3fSPaul Mullowney { 2709aa372e3fSPaul Mullowney template <typename Tuple> 2710aa372e3fSPaul Mullowney __host__ __device__ 2711aa372e3fSPaul Mullowney void operator()(Tuple t) 2712aa372e3fSPaul Mullowney { 2713aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2714aa372e3fSPaul Mullowney } 2715aa372e3fSPaul Mullowney }; 2716aa372e3fSPaul Mullowney 27177e8381f9SStefano Zampini struct VecCUDAEquals 27187e8381f9SStefano Zampini { 27197e8381f9SStefano Zampini template <typename Tuple> 27207e8381f9SStefano Zampini __host__ __device__ 27217e8381f9SStefano Zampini void operator()(Tuple t) 27227e8381f9SStefano Zampini { 27237e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 27247e8381f9SStefano Zampini } 27257e8381f9SStefano Zampini }; 27267e8381f9SStefano Zampini 2727e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 2728e6e9a74fSStefano Zampini { 2729e6e9a74fSStefano Zampini template <typename Tuple> 2730e6e9a74fSStefano Zampini __host__ __device__ 2731e6e9a74fSStefano Zampini void operator()(Tuple t) 2732e6e9a74fSStefano Zampini { 2733e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 2734e6e9a74fSStefano Zampini } 
2735e6e9a74fSStefano Zampini }; 2736e6e9a74fSStefano Zampini 2737afb2bd1cSJunchao Zhang struct MatMatCusparse { 2738ccdfe979SStefano Zampini PetscBool cisdense; 2739ccdfe979SStefano Zampini PetscScalar *Bt; 2740ccdfe979SStefano Zampini Mat X; 2741fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2742fcdce8c4SStefano Zampini PetscLogDouble flops; 2743fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 2744b4285af6SJunchao Zhang 2745afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2746fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2747afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2748afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 2749afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 2750afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 2751b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 2752b4285af6SJunchao Zhang void *dBuffer4; 2753b4285af6SJunchao Zhang void *dBuffer5; 2754b4285af6SJunchao Zhang #endif 2755fcdce8c4SStefano Zampini size_t mmBufferSize; 2756fcdce8c4SStefano Zampini void *mmBuffer; 2757fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2758fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2759afb2bd1cSJunchao Zhang #endif 2760afb2bd1cSJunchao Zhang }; 2761ccdfe979SStefano Zampini 2762ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2763ccdfe979SStefano Zampini { 2764ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 2765ccdfe979SStefano Zampini 2766ccdfe979SStefano Zampini PetscFunctionBegin; 27679566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->Bt)); 2768fcdce8c4SStefano Zampini delete mmdata->Bcsr; 2769afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 27709566063dSJacob Faibussowitsch if (mmdata->matSpBDescr) 
PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 27719566063dSJacob Faibussowitsch if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 27729566063dSJacob Faibussowitsch if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 27739566063dSJacob Faibussowitsch if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2774b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 27759566063dSJacob Faibussowitsch if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 27769566063dSJacob Faibussowitsch if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2777b4285af6SJunchao Zhang #endif 27789566063dSJacob Faibussowitsch if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 27799566063dSJacob Faibussowitsch if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2780afb2bd1cSJunchao Zhang #endif 27819566063dSJacob Faibussowitsch PetscCall(MatDestroy(&mmdata->X)); 27829566063dSJacob Faibussowitsch PetscCall(PetscFree(data)); 2783ccdfe979SStefano Zampini PetscFunctionReturn(0); 2784ccdfe979SStefano Zampini } 2785ccdfe979SStefano Zampini 2786ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 2787ccdfe979SStefano Zampini 2788ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2789ccdfe979SStefano Zampini { 2790ccdfe979SStefano Zampini Mat_Product *product = C->product; 2791ccdfe979SStefano Zampini Mat A,B; 2792afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 2793ccdfe979SStefano Zampini PetscBool flg,biscuda; 2794ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2795ccdfe979SStefano Zampini cusparseStatus_t stat; 2796ccdfe979SStefano Zampini cusparseOperation_t opA; 2797ccdfe979SStefano Zampini const PetscScalar *barray; 2798ccdfe979SStefano Zampini PetscScalar *carray; 2799ccdfe979SStefano 
Zampini MatMatCusparse *mmdata; 2800ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2801ccdfe979SStefano Zampini CsrMatrix *csrmat; 2802ccdfe979SStefano Zampini 2803ccdfe979SStefano Zampini PetscFunctionBegin; 2804ccdfe979SStefano Zampini MatCheckProduct(C,1); 280528b400f6SJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 2806ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 2807ccdfe979SStefano Zampini A = product->A; 2808ccdfe979SStefano Zampini B = product->B; 28099566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 281028b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2811ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2812ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 281328b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 28149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2815ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2816ccdfe979SStefano Zampini switch (product->type) { 2817ccdfe979SStefano Zampini case MATPRODUCT_AB: 2818ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2819ccdfe979SStefano Zampini mat = cusp->mat; 2820ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2821ccdfe979SStefano Zampini m = A->rmap->n; 2822ccdfe979SStefano Zampini n = B->cmap->n; 2823ccdfe979SStefano Zampini break; 2824ccdfe979SStefano Zampini case MATPRODUCT_AtB: 28251a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2826e6e9a74fSStefano Zampini mat = cusp->mat; 2827e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 
2828e6e9a74fSStefano Zampini } else { 28299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2830ccdfe979SStefano Zampini mat = cusp->matTranspose; 2831ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2832e6e9a74fSStefano Zampini } 2833ccdfe979SStefano Zampini m = A->cmap->n; 2834ccdfe979SStefano Zampini n = B->cmap->n; 2835ccdfe979SStefano Zampini break; 2836ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2837ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2838ccdfe979SStefano Zampini mat = cusp->mat; 2839ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2840ccdfe979SStefano Zampini m = A->rmap->n; 2841ccdfe979SStefano Zampini n = B->rmap->n; 2842ccdfe979SStefano Zampini break; 2843ccdfe979SStefano Zampini default: 284498921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 2845ccdfe979SStefano Zampini } 284628b400f6SJacob Faibussowitsch PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 2847ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 2848ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 28499566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda)); 28509566063dSJacob Faibussowitsch if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B)); 28519566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayRead(B,&barray)); 2852afb2bd1cSJunchao Zhang 28539566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(B,&blda)); 2854c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 28559566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray)); 28569566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(mmdata->X,&clda)); 2857c8378d12SStefano Zampini } else { 28589566063dSJacob 
Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(C,&carray)); 28599566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C,&clda)); 2860c8378d12SStefano Zampini } 2861c8378d12SStefano Zampini 28629566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2863afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2864afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2865a5b23f4aSJose E. Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2866afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2867fcdce8c4SStefano Zampini size_t mmBufferSize; 28689566063dSJacob Faibussowitsch if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;} 2869afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 28709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2871afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2872afb2bd1cSJunchao Zhang } 2873c8378d12SStefano Zampini 28749566063dSJacob Faibussowitsch if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;} 2875afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 28769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL)); 2877afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2878afb2bd1cSJunchao Zhang } 2879afb2bd1cSJunchao Zhang 2880afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2881afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2882afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 
2883afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2884afb2bd1cSJunchao Zhang csrmat->values->data().get(), 2885afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 28869566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat); 2887afb2bd1cSJunchao Zhang } 2888afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2889afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2890afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 28919566063dSJacob Faibussowitsch cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat); 2892fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 28939566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 28949566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize)); 2895fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2896fcdce8c4SStefano Zampini } 2897afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2898afb2bd1cSJunchao Zhang } else { 2899afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 29009566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get())); 29019566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray)); 29029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray)); 2903afb2bd1cSJunchao Zhang } 2904afb2bd1cSJunchao Zhang 2905afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2906afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2907afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2908afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 
29099566063dSJacob Faibussowitsch cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat); 2910afb2bd1cSJunchao Zhang #else 2911afb2bd1cSJunchao Zhang PetscInt k; 2912afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2913ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2914ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2915ccdfe979SStefano Zampini cublasStatus_t cerr; 2916ccdfe979SStefano Zampini 29179566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2918ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2919ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2920ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2921ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 29229566063dSJacob Faibussowitsch mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr); 2923ccdfe979SStefano Zampini blda = B->cmap->n; 2924afb2bd1cSJunchao Zhang k = B->cmap->n; 2925afb2bd1cSJunchao Zhang } else { 2926afb2bd1cSJunchao Zhang k = B->rmap->n; 2927ccdfe979SStefano Zampini } 2928ccdfe979SStefano Zampini 2929afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2930ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2931afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2932ccdfe979SStefano Zampini csrmat->values->data().get(), 2933ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2934ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2935ccdfe979SStefano Zampini mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 29369566063dSJacob Faibussowitsch carray,clda);PetscCallCUSPARSE(stat); 2937afb2bd1cSJunchao Zhang #endif 29389566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 29399566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries)); 29409566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayRead(B,&barray)); 2941ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 29429566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 29439566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE)); 2944ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 29459566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray)); 29469566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE)); 2947ccdfe979SStefano Zampini } else { 29489566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray)); 2949ccdfe979SStefano Zampini } 2950ccdfe979SStefano Zampini if (mmdata->cisdense) { 29519566063dSJacob Faibussowitsch PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C)); 2952ccdfe979SStefano Zampini } 2953ccdfe979SStefano Zampini if (!biscuda) { 29549566063dSJacob Faibussowitsch PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B)); 2955ccdfe979SStefano Zampini } 2956ccdfe979SStefano Zampini PetscFunctionReturn(0); 2957ccdfe979SStefano Zampini } 2958ccdfe979SStefano Zampini 2959ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2960ccdfe979SStefano Zampini { 2961ccdfe979SStefano Zampini Mat_Product *product = C->product; 2962ccdfe979SStefano Zampini Mat A,B; 2963ccdfe979SStefano Zampini PetscInt m,n; 2964ccdfe979SStefano Zampini PetscBool cisdense,flg; 2965ccdfe979SStefano Zampini MatMatCusparse 
*mmdata; 2966ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2967ccdfe979SStefano Zampini 2968ccdfe979SStefano Zampini PetscFunctionBegin; 2969ccdfe979SStefano Zampini MatCheckProduct(C,1); 297028b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 2971ccdfe979SStefano Zampini A = product->A; 2972ccdfe979SStefano Zampini B = product->B; 29739566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 297428b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 2975ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 297608401ef6SPierre Jolivet PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 2977ccdfe979SStefano Zampini switch (product->type) { 2978ccdfe979SStefano Zampini case MATPRODUCT_AB: 2979ccdfe979SStefano Zampini m = A->rmap->n; 2980ccdfe979SStefano Zampini n = B->cmap->n; 2981ccdfe979SStefano Zampini break; 2982ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2983ccdfe979SStefano Zampini m = A->cmap->n; 2984ccdfe979SStefano Zampini n = B->cmap->n; 2985ccdfe979SStefano Zampini break; 2986ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2987ccdfe979SStefano Zampini m = A->rmap->n; 2988ccdfe979SStefano Zampini n = B->rmap->n; 2989ccdfe979SStefano Zampini break; 2990ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2991ccdfe979SStefano Zampini m = B->cmap->n; 2992ccdfe979SStefano Zampini n = B->cmap->n; 2993ccdfe979SStefano Zampini break; 2994ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2995ccdfe979SStefano Zampini m = B->rmap->n; 2996ccdfe979SStefano Zampini n = B->rmap->n; 2997ccdfe979SStefano Zampini break; 2998ccdfe979SStefano Zampini default: 299998921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type 
%s",MatProductTypes[product->type]); 3000ccdfe979SStefano Zampini } 30019566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,m,n)); 3002ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 30039566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense)); 30049566063dSJacob Faibussowitsch PetscCall(MatSetType(C,MATSEQDENSECUDA)); 3005ccdfe979SStefano Zampini 3006ccdfe979SStefano Zampini /* product data */ 30079566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 3008ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 3009afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 3010afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 3011ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 30129566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar))); 3013ccdfe979SStefano Zampini } 3014afb2bd1cSJunchao Zhang #endif 3015ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 3016ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 30179566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X)); 30189566063dSJacob Faibussowitsch PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA)); 3019ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 30209566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n)); 3021ccdfe979SStefano Zampini } else { 30229566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n)); 3023ccdfe979SStefano Zampini } 3024ccdfe979SStefano 
Zampini } 3025ccdfe979SStefano Zampini C->product->data = mmdata; 3026ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 3027ccdfe979SStefano Zampini 3028ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 3029ccdfe979SStefano Zampini PetscFunctionReturn(0); 3030ccdfe979SStefano Zampini } 3031ccdfe979SStefano Zampini 3032fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3033ccdfe979SStefano Zampini { 3034ccdfe979SStefano Zampini Mat_Product *product = C->product; 3035fcdce8c4SStefano Zampini Mat A,B; 3036fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 3037fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 3038fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 3039fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 3040fcdce8c4SStefano Zampini PetscBool flg; 3041fcdce8c4SStefano Zampini cusparseStatus_t stat; 3042fcdce8c4SStefano Zampini MatProductType ptype; 3043fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 3044fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3045fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 3046fcdce8c4SStefano Zampini #endif 3047b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3048ccdfe979SStefano Zampini 3049ccdfe979SStefano Zampini PetscFunctionBegin; 3050ccdfe979SStefano Zampini MatCheckProduct(C,1); 305128b400f6SJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty"); 30529566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg)); 305328b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name); 3054fcdce8c4SStefano Zampini mmdata = 
(MatMatCusparse*)C->product->data; 3055fcdce8c4SStefano Zampini A = product->A; 3056fcdce8c4SStefano Zampini B = product->B; 3057fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 3058fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 3059fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 306008401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3061fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 306228b400f6SJacob Faibussowitsch PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 3063fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 306428b400f6SJacob Faibussowitsch PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 3065fcdce8c4SStefano Zampini goto finalize; 3066fcdce8c4SStefano Zampini } 3067fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 30689566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 306928b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 30709566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 307128b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 307228b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 307328b400f6SJacob Faibussowitsch PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix 
between MatProductSymbolic and MatProductNumeric phases"); 3074fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3075fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 3076fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 307708401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 307808401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 307908401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 30809566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 30819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3082fcdce8c4SStefano Zampini 3083fcdce8c4SStefano Zampini ptype = product->type; 3084fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 3085fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 308628b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric"); 3087fa046f9fSJunchao Zhang } 3088fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 3089fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 309028b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric"); 3091fa046f9fSJunchao Zhang } 3092fcdce8c4SStefano Zampini switch (ptype) { 3093fcdce8c4SStefano Zampini case MATPRODUCT_AB: 3094fcdce8c4SStefano Zampini Amat = Acusp->mat; 3095fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3096fcdce8c4SStefano Zampini break; 3097fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 3098fcdce8c4SStefano Zampini 
Amat = Acusp->matTranspose; 3099fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3100fcdce8c4SStefano Zampini break; 3101fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 3102fcdce8c4SStefano Zampini Amat = Acusp->mat; 3103fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 3104fcdce8c4SStefano Zampini break; 3105fcdce8c4SStefano Zampini default: 310698921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 3107fcdce8c4SStefano Zampini } 3108fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 310928b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 311028b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 311128b400f6SJacob Faibussowitsch PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 3112fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 3113fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 3114fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 311528b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 311628b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 311728b400f6SJacob Faibussowitsch PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct"); 31189566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3119fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3120fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 31219566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3122b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3123b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 3124b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3125b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 31269566063dSJacob Faibussowitsch mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3127b4285af6SJunchao Zhang #else 3128b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3129fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3130fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 31319566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 3132b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 3133fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 31349566063dSJacob Faibussowitsch cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3135b4285af6SJunchao Zhang #endif 3136fcdce8c4SStefano Zampini #else 3137b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 3138fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3139fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3140fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 31419566063dSJacob Faibussowitsch Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), 
Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 3142fcdce8c4SStefano Zampini #endif 31439566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 31449566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 31459566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3146fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 3147fcdce8c4SStefano Zampini finalize: 3148fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 31499566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 31509566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 31519566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 3152fcdce8c4SStefano Zampini c->reallocs = 0; 3153fcdce8c4SStefano Zampini C->info.mallocs += 0; 3154fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 3155fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 3156fcdce8c4SStefano Zampini C->num_ass++; 3157ccdfe979SStefano Zampini PetscFunctionReturn(0); 3158ccdfe979SStefano Zampini } 3159fcdce8c4SStefano Zampini 3160fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3161fcdce8c4SStefano Zampini { 3162fcdce8c4SStefano Zampini Mat_Product *product = C->product; 3163fcdce8c4SStefano Zampini Mat A,B; 3164fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 3165fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 3166fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 3167fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 3168fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 3169fcdce8c4SStefano Zampini PetscBool flg; 3170fcdce8c4SStefano Zampini cusparseStatus_t stat; 3171fcdce8c4SStefano Zampini MatProductType ptype; 3172fcdce8c4SStefano 
Zampini MatMatCusparse *mmdata; 3173fcdce8c4SStefano Zampini PetscLogDouble flops; 3174fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 3175fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3176fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 3177fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 3178fcdce8c4SStefano Zampini #else 3179fcdce8c4SStefano Zampini int cnz; 3180fcdce8c4SStefano Zampini #endif 3181b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3182fcdce8c4SStefano Zampini 3183fcdce8c4SStefano Zampini PetscFunctionBegin; 3184fcdce8c4SStefano Zampini MatCheckProduct(C,1); 318528b400f6SJacob Faibussowitsch PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty"); 3186fcdce8c4SStefano Zampini A = product->A; 3187fcdce8c4SStefano Zampini B = product->B; 31889566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg)); 318928b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name); 31909566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg)); 319128b400f6SJacob Faibussowitsch PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name); 3192fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 3193fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 3194fcdce8c4SStefano Zampini /* product data */ 31959566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 3196fcdce8c4SStefano Zampini C->product->data = mmdata; 3197fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 3198fcdce8c4SStefano Zampini 31999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 32009566063dSJacob Faibussowitsch 
PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3201d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3202d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 320308401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 320408401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format"); 3205d60bce21SJunchao Zhang 3206fcdce8c4SStefano Zampini ptype = product->type; 3207fa046f9fSJunchao Zhang if (A->symmetric && ptype == MATPRODUCT_AtB) { 3208fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 3209fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3210fa046f9fSJunchao Zhang } 3211fa046f9fSJunchao Zhang if (B->symmetric && ptype == MATPRODUCT_ABt) { 3212fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 3213fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3214fa046f9fSJunchao Zhang } 3215fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 3216fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 3217fcdce8c4SStefano Zampini switch (ptype) { 3218fcdce8c4SStefano Zampini case MATPRODUCT_AB: 3219fcdce8c4SStefano Zampini m = A->rmap->n; 3220fcdce8c4SStefano Zampini n = B->cmap->n; 3221fcdce8c4SStefano Zampini k = A->cmap->n; 3222fcdce8c4SStefano Zampini Amat = Acusp->mat; 3223fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3224fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3225fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3226fcdce8c4SStefano Zampini break; 3227fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 3228fcdce8c4SStefano Zampini m = A->cmap->n; 3229fcdce8c4SStefano Zampini n = B->cmap->n; 3230fcdce8c4SStefano Zampini k = A->rmap->n; 32319566063dSJacob Faibussowitsch 
PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3232fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 3233fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3234fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3235fcdce8c4SStefano Zampini break; 3236fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 3237fcdce8c4SStefano Zampini m = A->rmap->n; 3238fcdce8c4SStefano Zampini n = B->rmap->n; 3239fcdce8c4SStefano Zampini k = A->cmap->n; 32409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3241fcdce8c4SStefano Zampini Amat = Acusp->mat; 3242fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 3243fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3244fcdce8c4SStefano Zampini break; 3245fcdce8c4SStefano Zampini default: 324698921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]); 3247fcdce8c4SStefano Zampini } 3248fcdce8c4SStefano Zampini 3249fcdce8c4SStefano Zampini /* create cusparse matrix */ 32509566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,m,n)); 32519566063dSJacob Faibussowitsch PetscCall(MatSetType(C,MATSEQAIJCUSPARSE)); 3252fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 3253fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 3254fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3255fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 3256fcdce8c4SStefano Zampini 3257fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 3258fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3259fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 32609566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex)); 32619566063dSJacob Faibussowitsch 
PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows)); 3262fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3263fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3264fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 3265fcdce8c4SStefano Zampini } else { 3266fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 3267fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 3268fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 3269fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 3270fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 3271fcdce8c4SStefano Zampini } 3272fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 3273fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 3274fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 3275fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 3276fcdce8c4SStefano Zampini Ccsr->num_cols = n; 3277fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 32789566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 32799566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 32809566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 32819566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 32829566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar))); 32839566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 32849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 32859566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 32869566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 3287fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 3288fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 3289fcdce8c4SStefano Zampini c->nz = 0; 3290fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3291fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 3292fcdce8c4SStefano Zampini goto finalizesym; 3293fcdce8c4SStefano Zampini } 3294fcdce8c4SStefano Zampini 329528b400f6SJacob Faibussowitsch PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 329628b400f6SJacob Faibussowitsch PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 3297fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 3298fcdce8c4SStefano Zampini if (!biscompressed) { 3299fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 3300fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3301fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 3302fcdce8c4SStefano Zampini #endif 3303fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 3304fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 3305fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 3306fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 3307fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 3308fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 3309fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 3310fcdce8c4SStefano 
Zampini Bcsr->values = cBcsr->values; 3311fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 3312fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3313fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 33149566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 3315fcdce8c4SStefano Zampini } 3316fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3317fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 3318fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3319fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 3320fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 3321fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 3322fcdce8c4SStefano Zampini Bcsr->values->data().get(), 3323fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 33249566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 3325fcdce8c4SStefano Zampini } 3326fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 3327fcdce8c4SStefano Zampini #endif 3328fcdce8c4SStefano Zampini } 332928b400f6SJacob Faibussowitsch PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct"); 333028b400f6SJacob Faibussowitsch PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct"); 3331fcdce8c4SStefano Zampini /* precompute flops count */ 3332fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 3333fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 3334fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 3335fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 3336fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 3337fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 3338fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - 
b->i[brow]); 3339fcdce8c4SStefano Zampini } 3340fcdce8c4SStefano Zampini } 3341fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 3342fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 3343fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 3344fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 3345fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 3346fcdce8c4SStefano Zampini } 3347fcdce8c4SStefano Zampini } else { /* TODO */ 3348fcdce8c4SStefano Zampini flops = 0.; 3349fcdce8c4SStefano Zampini } 3350fcdce8c4SStefano Zampini 3351fcdce8c4SStefano Zampini mmdata->flops = flops; 33529566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3353b4285af6SJunchao Zhang 3354fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 33559566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3356fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 3357fcdce8c4SStefano Zampini NULL, NULL, NULL, 3358fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 33599566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 33609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3361b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3362b4285af6SJunchao Zhang { 3363b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
3364b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3365b4285af6SJunchao Zhang */ 3366b4285af6SJunchao Zhang void* dBuffer1 = NULL; 3367b4285af6SJunchao Zhang void* dBuffer2 = NULL; 3368b4285af6SJunchao Zhang void* dBuffer3 = NULL; 3369b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3370b4285af6SJunchao Zhang size_t bufferSize1 = 0; 3371b4285af6SJunchao Zhang size_t bufferSize2 = 0; 3372b4285af6SJunchao Zhang size_t bufferSize3 = 0; 3373b4285af6SJunchao Zhang size_t bufferSize4 = 0; 3374b4285af6SJunchao Zhang size_t bufferSize5 = 0; 3375b4285af6SJunchao Zhang 3376b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 3377b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 3378b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3379b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 33809566063dSJacob Faibussowitsch &bufferSize1, NULL);PetscCallCUSPARSE(stat); 33819566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1)); 3382b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 3383b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3384b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 33859566063dSJacob Faibussowitsch &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat); 3386b4285af6SJunchao Zhang 3387b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 3388b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3389b4285af6SJunchao Zhang 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 33909566063dSJacob Faibussowitsch &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat); 33919566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2)); 33929566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3)); 33939566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4)); 3394b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3395b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 33969566063dSJacob Faibussowitsch &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat); 33979566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer1)); 33989566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer2)); 3399b4285af6SJunchao Zhang 3400b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 3401b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 34029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3403b4285af6SJunchao Zhang c->nz = (PetscInt) C_nnz1; 3404b4285af6SJunchao Zhang /* allocate matrix C */ 34059566063dSJacob Faibussowitsch Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 34069566063dSJacob Faibussowitsch Ccsr->values = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3407b4285af6SJunchao Zhang /* update matC with the new pointers */ 3408b4285af6SJunchao Zhang stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 34099566063dSJacob Faibussowitsch Ccsr->values->data().get());PetscCallCUSPARSE(stat); 3410b4285af6SJunchao Zhang 
3411b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 3412b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3413b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 34149566063dSJacob Faibussowitsch &bufferSize5, NULL);PetscCallCUSPARSE(stat); 34159566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5)); 3416b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, 3417b4285af6SJunchao Zhang CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, 34189566063dSJacob Faibussowitsch &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat); 34199566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer3)); 3420b4285af6SJunchao Zhang stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, 3421b4285af6SJunchao Zhang Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3422b4285af6SJunchao Zhang cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 34239566063dSJacob Faibussowitsch mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 34249566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024)); 3425b4285af6SJunchao Zhang } 3426ae37ee31SJunchao Zhang #else 3427b4285af6SJunchao Zhang size_t bufSize2; 3428fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 3429b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 3430fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3431fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 34329566063dSJacob 
Faibussowitsch mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat); 34339566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2)); 3434fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 3435b4285af6SJunchao Zhang stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, 3436fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3437fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 34389566063dSJacob Faibussowitsch mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat); 3439fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 3440b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3441fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3442fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 34439566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat); 3444fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 3445fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 3446fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3447fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3448fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 34499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize)); 3450fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 3451b4285af6SJunchao Zhang stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, 3452fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 3453fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 34549566063dSJacob Faibussowitsch mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat); 3455fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 34569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3457fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 34589566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024)); 3459fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 34609566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3461fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 34629566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3463fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 34649566063dSJacob Faibussowitsch Ccsr->values->data().get());PetscCallCUSPARSE(stat); 3465b4285af6SJunchao Zhang stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, 3466fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 34679566063dSJacob Faibussowitsch cusparse_scalartype, 
CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat); 3468ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3469fcdce8c4SStefano Zampini #else 34709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3471b4285af6SJunchao Zhang stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, 3472fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3473fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3474fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 34759566063dSJacob Faibussowitsch Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat); 3476fcdce8c4SStefano Zampini c->nz = cnz; 3477fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 34789566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3479fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 34809566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3481fcdce8c4SStefano Zampini 34829566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3483fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3484fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3485fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 3486b4285af6SJunchao Zhang stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, 3487fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 3488fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 3489fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 34909566063dSJacob Faibussowitsch Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat); 3491fcdce8c4SStefano Zampini #endif 34929566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 34939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3494fcdce8c4SStefano Zampini finalizesym: 3495fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 3496fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 3497fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 34989566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m+1,&c->i)); 34999566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->j)); 3500fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3501fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3502fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3503fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3504fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 3505fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 3506fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 35079566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 35089566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3509fcdce8c4SStefano Zampini } else { 3510fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3511fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 35129566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 35139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 3514fcdce8c4SStefano Zampini } 3515fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 3516fcdce8c4SStefano Zampini PetscInt r = 0; 3517fcdce8c4SStefano Zampini c->i[0] = 0; 3518fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 3519fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 3520fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 3521fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 3522fcdce8c4SStefano Zampini } 3523fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 3524fcdce8c4SStefano Zampini } 35259566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 35269566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->ilen)); 35279566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->imax)); 3528fcdce8c4SStefano Zampini c->maxnz = c->nz; 3529fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 3530fcdce8c4SStefano Zampini c->rmax = 0; 3531fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 3532fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 3533fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 3534fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 3535fcdce8c4SStefano Zampini c->rmax = 
PetscMax(c->rmax,nn); 3536fcdce8c4SStefano Zampini } 35379566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(C)); 35389566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->a)); 3539fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 3540fcdce8c4SStefano Zampini 3541fcdce8c4SStefano Zampini C->nonzerostate++; 35429566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->rmap)); 35439566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->cmap)); 3544fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 3545fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3546fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 3547fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 3548fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 3549abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3550fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 3551fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 3552fcdce8c4SStefano Zampini } 3553fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3554fcdce8c4SStefano Zampini PetscFunctionReturn(0); 3555fcdce8c4SStefano Zampini } 3556fcdce8c4SStefano Zampini 3557fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3558fcdce8c4SStefano Zampini 3559fcdce8c4SStefano Zampini /* handles sparse or dense B */ 3560fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3561fcdce8c4SStefano Zampini { 3562fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 3563fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 3564fcdce8c4SStefano Zampini 3565fcdce8c4SStefano Zampini PetscFunctionBegin; 3566fcdce8c4SStefano Zampini 
MatCheckProduct(mat,1); 35679566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense)); 3568abb89eb1SStefano Zampini if (!product->A->boundtocpu && !product->B->boundtocpu) { 35699566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp)); 3570fcdce8c4SStefano Zampini } 3571fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 3572fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 3573fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 35749566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp)); 3575fcdce8c4SStefano Zampini } 3576fcdce8c4SStefano Zampini } 357765e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 357865e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 357965e4b4d4SStefano Zampini switch (product->type) { 358065e4b4d4SStefano Zampini case MATPRODUCT_AB: 358165e4b4d4SStefano Zampini if (product->api_user) { 3582d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat"); 35839566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 3584d0609cedSBarry Smith PetscOptionsEnd(); 358565e4b4d4SStefano Zampini } else { 3586d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat"); 35879566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 3588d0609cedSBarry Smith PetscOptionsEnd(); 358965e4b4d4SStefano Zampini } 359065e4b4d4SStefano Zampini break; 359165e4b4d4SStefano Zampini case MATPRODUCT_AtB: 359265e4b4d4SStefano Zampini if (product->api_user) { 3593d0609cedSBarry Smith 
PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat"); 35949566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 3595d0609cedSBarry Smith PetscOptionsEnd(); 359665e4b4d4SStefano Zampini } else { 3597d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat"); 35989566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 3599d0609cedSBarry Smith PetscOptionsEnd(); 360065e4b4d4SStefano Zampini } 360165e4b4d4SStefano Zampini break; 360265e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 360365e4b4d4SStefano Zampini if (product->api_user) { 3604d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat"); 36059566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 3606d0609cedSBarry Smith PetscOptionsEnd(); 360765e4b4d4SStefano Zampini } else { 3608d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat"); 36099566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 3610d0609cedSBarry Smith PetscOptionsEnd(); 361165e4b4d4SStefano Zampini } 361265e4b4d4SStefano Zampini break; 361365e4b4d4SStefano Zampini case MATPRODUCT_RARt: 361465e4b4d4SStefano Zampini if (product->api_user) { 3615d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat"); 36169566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 3617d0609cedSBarry Smith PetscOptionsEnd(); 361865e4b4d4SStefano 
Zampini } else { 3619d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat"); 36209566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL)); 3621d0609cedSBarry Smith PetscOptionsEnd(); 362265e4b4d4SStefano Zampini } 362365e4b4d4SStefano Zampini break; 362465e4b4d4SStefano Zampini case MATPRODUCT_ABC: 362565e4b4d4SStefano Zampini if (product->api_user) { 3626d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat"); 36279566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 3628d0609cedSBarry Smith PetscOptionsEnd(); 362965e4b4d4SStefano Zampini } else { 3630d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat"); 36319566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL)); 3632d0609cedSBarry Smith PetscOptionsEnd(); 363365e4b4d4SStefano Zampini } 363465e4b4d4SStefano Zampini break; 363565e4b4d4SStefano Zampini default: 363665e4b4d4SStefano Zampini break; 363765e4b4d4SStefano Zampini } 363865e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 363965e4b4d4SStefano Zampini } 364065e4b4d4SStefano Zampini /* dispatch */ 3641fcdce8c4SStefano Zampini if (isdense) { 3642ccdfe979SStefano Zampini switch (product->type) { 3643ccdfe979SStefano Zampini case MATPRODUCT_AB: 3644ccdfe979SStefano Zampini case MATPRODUCT_AtB: 3645ccdfe979SStefano Zampini case MATPRODUCT_ABt: 3646ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 3647ccdfe979SStefano Zampini case MATPRODUCT_RARt: 3648fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 36499566063dSJacob Faibussowitsch 
PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3650fcdce8c4SStefano Zampini } else { 3651fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3652fcdce8c4SStefano Zampini } 3653fcdce8c4SStefano Zampini break; 3654fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 3655fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3656fcdce8c4SStefano Zampini break; 3657ccdfe979SStefano Zampini default: 3658ccdfe979SStefano Zampini break; 3659ccdfe979SStefano Zampini } 3660fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 3661fcdce8c4SStefano Zampini switch (product->type) { 3662fcdce8c4SStefano Zampini case MATPRODUCT_AB: 3663fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 3664fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 3665fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3666fcdce8c4SStefano Zampini break; 3667fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 3668fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 3669fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 3670fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3671fcdce8c4SStefano Zampini break; 3672fcdce8c4SStefano Zampini default: 3673fcdce8c4SStefano Zampini break; 3674fcdce8c4SStefano Zampini } 3675fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 36769566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3677fcdce8c4SStefano Zampini } 3678ccdfe979SStefano Zampini PetscFunctionReturn(0); 3679ccdfe979SStefano Zampini } 3680ccdfe979SStefano Zampini 36816fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 36829ae82921SPaul Mullowney { 36839ae82921SPaul Mullowney PetscFunctionBegin; 36849566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE)); 3685e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3686e6e9a74fSStefano Zampini } 
3687e6e9a74fSStefano Zampini 3688e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 3689e6e9a74fSStefano Zampini { 3690e6e9a74fSStefano Zampini PetscFunctionBegin; 36919566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE)); 3692e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3693e6e9a74fSStefano Zampini } 3694e6e9a74fSStefano Zampini 3695e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3696e6e9a74fSStefano Zampini { 3697e6e9a74fSStefano Zampini PetscFunctionBegin; 36989566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE)); 3699e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3700e6e9a74fSStefano Zampini } 3701e6e9a74fSStefano Zampini 3702e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3703e6e9a74fSStefano Zampini { 3704e6e9a74fSStefano Zampini PetscFunctionBegin; 37059566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE)); 37069ae82921SPaul Mullowney PetscFunctionReturn(0); 37079ae82921SPaul Mullowney } 37089ae82921SPaul Mullowney 37096fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 3710ca45077fSPaul Mullowney { 3711ca45077fSPaul Mullowney PetscFunctionBegin; 37129566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE)); 3713ca45077fSPaul Mullowney PetscFunctionReturn(0); 3714ca45077fSPaul Mullowney } 3715ca45077fSPaul Mullowney 3716a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y) 3717a0e72f99SJunchao Zhang { 3718a0e72f99SJunchao Zhang int i = blockIdx.x*blockDim.x + threadIdx.x; 3719a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 3720a0e72f99SJunchao Zhang } 
3721a0e72f99SJunchao Zhang 3722afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3723e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 37249ae82921SPaul Mullowney { 37259ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3726aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 37279ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3728e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 3729e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3730e6e9a74fSStefano Zampini PetscBool compressed; 3731afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3732afb2bd1cSJunchao Zhang PetscInt nx,ny; 3733afb2bd1cSJunchao Zhang #endif 37346e111a19SKarl Rupp 37359ae82921SPaul Mullowney PetscFunctionBegin; 373608401ef6SPierre Jolivet PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported"); 3737cbc6b225SStefano Zampini if (!a->nz) { 37389566063dSJacob Faibussowitsch if (!yy) PetscCall(VecSet_SeqCUDA(zz,0)); 37399566063dSJacob Faibussowitsch else PetscCall(VecCopy_SeqCUDA(yy,zz)); 3740e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3741e6e9a74fSStefano Zampini } 374234d6c7a5SJose E. 
Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 37439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3744e6e9a74fSStefano Zampini if (!trans) { 37459ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 37465f80ce2aSJacob Faibussowitsch PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3747e6e9a74fSStefano Zampini } else { 37481a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3749e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3750e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 3751e6e9a74fSStefano Zampini } else { 37529566063dSJacob Faibussowitsch if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3753e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 3754e6e9a74fSStefano Zampini } 3755e6e9a74fSStefano Zampini } 3756e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3757e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3758213423ffSJunchao Zhang 3759e6e9a74fSStefano Zampini try { 37609566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray)); 37619566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 37629566063dSJacob Faibussowitsch else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */ 3763afb2bd1cSJunchao Zhang 37649566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3765e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3766afb2bd1cSJunchao Zhang /* z = A x + beta y. 
3767afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3768afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3769afb2bd1cSJunchao Zhang */ 3770e6e9a74fSStefano Zampini xptr = xarray; 3771afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3772213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3773afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3774afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3775afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3776afb2bd1cSJunchao Zhang */ 3777afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3778afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3779afb2bd1cSJunchao Zhang nx = mat->num_cols; 3780afb2bd1cSJunchao Zhang ny = mat->num_rows; 3781afb2bd1cSJunchao Zhang } 3782afb2bd1cSJunchao Zhang #endif 3783e6e9a74fSStefano Zampini } else { 3784afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3785afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3786afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3787afb2bd1cSJunchao Zhang */ 3788afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3789e6e9a74fSStefano Zampini dptr = zarray; 3790e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3791afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3792e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3793a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3794e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3795e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 3796e6e9a74fSStefano Zampini } 3797afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3798afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3799afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 3800afb2bd1cSJunchao Zhang nx = mat->num_rows; 3801afb2bd1cSJunchao Zhang ny = mat->num_cols; 3802afb2bd1cSJunchao Zhang } 3803afb2bd1cSJunchao Zhang #endif 3804e6e9a74fSStefano Zampini } 38059ae82921SPaul Mullowney 3806afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3807aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3808afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 38095f80ce2aSJacob Faibussowitsch PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3810afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 38119566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype)); 38129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype)); 
38139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 3814afb2bd1cSJunchao Zhang matstruct->matDescr, 3815afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 3816afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3817afb2bd1cSJunchao Zhang cusparse_scalartype, 3818afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 38195f80ce2aSJacob Faibussowitsch &matstruct->cuSpMV[opA].spmvBufferSize)); 38209566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize)); 3821afb2bd1cSJunchao Zhang 3822afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3823afb2bd1cSJunchao Zhang } else { 3824afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 38259566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr)); 38269566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr)); 3827afb2bd1cSJunchao Zhang } 3828afb2bd1cSJunchao Zhang 38299566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, 3830afb2bd1cSJunchao Zhang matstruct->alpha_one, 38313606e59fSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 3832afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 3833afb2bd1cSJunchao Zhang beta, 3834afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 3835afb2bd1cSJunchao Zhang cusparse_scalartype, 3836afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 38375f80ce2aSJacob Faibussowitsch matstruct->cuSpMV[opA].spmvBuffer)); 3838afb2bd1cSJunchao Zhang #else 38397656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 38409566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, 
3841a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 3842afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 3843aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 3844e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 38455f80ce2aSJacob Faibussowitsch dptr)); 3846afb2bd1cSJunchao Zhang #endif 3847aa372e3fSPaul Mullowney } else { 3848213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3849afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3850afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3851afb2bd1cSJunchao Zhang #else 3852301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 38539566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, 3854afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 3855e6e9a74fSStefano Zampini xptr, beta, 38565f80ce2aSJacob Faibussowitsch dptr)); 3857afb2bd1cSJunchao Zhang #endif 3858a65300a6SPaul Mullowney } 3859aa372e3fSPaul Mullowney } 38609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3861aa372e3fSPaul Mullowney 3862e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3863213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3864213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 38659566063dSJacob Faibussowitsch PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */ 3866e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 38679566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 38687656d835SStefano Zampini } 3869213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 38709566063dSJacob Faibussowitsch PetscCall(VecSet_SeqCUDA(zz,0)); 38717656d835SStefano Zampini } 38727656d835SStefano Zampini 3873213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3874213423ffSJunchao Zhang if (compressed) { 38759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3876a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3877a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3878a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 3879a0e72f99SJunchao Zhang */ 3880a0e72f99SJunchao Zhang #if 0 3881a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3882a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3883a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3884e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3885c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3886a0e72f99SJunchao Zhang #else 3887a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3888a0e72f99SJunchao Zhang ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray); 3889a0e72f99SJunchao Zhang #endif 38909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3891e6e9a74fSStefano Zampini } 3892e6e9a74fSStefano Zampini } else { 
3893e6e9a74fSStefano Zampini if (yy && yy != zz) { 38949566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */ 3895e6e9a74fSStefano Zampini } 3896e6e9a74fSStefano Zampini } 38979566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray)); 38989566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray)); 38999566063dSJacob Faibussowitsch else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray)); 39009ae82921SPaul Mullowney } catch(char *ex) { 390198921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 39029ae82921SPaul Mullowney } 3903e6e9a74fSStefano Zampini if (yy) { 39049566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz)); 3905e6e9a74fSStefano Zampini } else { 39069566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt)); 3907e6e9a74fSStefano Zampini } 39089ae82921SPaul Mullowney PetscFunctionReturn(0); 39099ae82921SPaul Mullowney } 39109ae82921SPaul Mullowney 39116fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 3912ca45077fSPaul Mullowney { 3913ca45077fSPaul Mullowney PetscFunctionBegin; 39149566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE)); 3915ca45077fSPaul Mullowney PetscFunctionReturn(0); 3916ca45077fSPaul Mullowney } 3917ca45077fSPaul Mullowney 39186fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 39199ae82921SPaul Mullowney { 3920042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3921042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 39223fa6b06aSMark Adams 3923042217e8SBarry Smith PetscFunctionBegin; 39249566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 3925042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 3926042217e8SBarry Smith 39279566063dSJacob 
Faibussowitsch PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 39289566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->deviceMat)); 3929042217e8SBarry Smith cusp->deviceMat = NULL; 3930042217e8SBarry Smith } 39319ae82921SPaul Mullowney PetscFunctionReturn(0); 39329ae82921SPaul Mullowney } 39339ae82921SPaul Mullowney 39349ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3935e057df02SPaul Mullowney /*@ 39369ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3937e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 3938e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3939e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 3940e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 3941e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 39429ae82921SPaul Mullowney 3943d083f849SBarry Smith Collective 39449ae82921SPaul Mullowney 39459ae82921SPaul Mullowney Input Parameters: 39469ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 39479ae82921SPaul Mullowney . m - number of rows 39489ae82921SPaul Mullowney . n - number of columns 39499ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 39509ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 39510298fd71SBarry Smith (possibly different for each row) or NULL 39529ae82921SPaul Mullowney 39539ae82921SPaul Mullowney Output Parameter: 39549ae82921SPaul Mullowney . 
A - the matrix 39559ae82921SPaul Mullowney 39569ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 39579ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 39589ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 39599ae82921SPaul Mullowney 39609ae82921SPaul Mullowney Notes: 39619ae82921SPaul Mullowney If nnz is given then nz is ignored 39629ae82921SPaul Mullowney 39639ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 39649ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 39659ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 39669ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 39679ae82921SPaul Mullowney 39689ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 39690298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 39709ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 39719ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 39729ae82921SPaul Mullowney 39739ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 39749ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 39759ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 39769ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 
39779ae82921SPaul Mullowney 39789ae82921SPaul Mullowney Level: intermediate 39799ae82921SPaul Mullowney 3980db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE` 39819ae82921SPaul Mullowney @*/ 39829ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 39839ae82921SPaul Mullowney { 39849ae82921SPaul Mullowney PetscFunctionBegin; 39859566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 39869566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,m,n)); 39879566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE)); 39889566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 39899ae82921SPaul Mullowney PetscFunctionReturn(0); 39909ae82921SPaul Mullowney } 39919ae82921SPaul Mullowney 39926fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 39939ae82921SPaul Mullowney { 39949ae82921SPaul Mullowney PetscFunctionBegin; 39959ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 39969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr)); 39979ae82921SPaul Mullowney } else { 39989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr)); 3999aa372e3fSPaul Mullowney } 40009566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 40019566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 40029566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL)); 40039566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 40049566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 40059566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 40069566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 40079566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 40089566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 40099566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL)); 40109566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 40119ae82921SPaul Mullowney PetscFunctionReturn(0); 40129ae82921SPaul Mullowney } 40139ae82921SPaul Mullowney 4014ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 401595639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 40169ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B) 40179ff858a8SKarl Rupp { 40189ff858a8SKarl Rupp PetscFunctionBegin; 40199566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B)); 40209566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B)); 40219ff858a8SKarl Rupp PetscFunctionReturn(0); 40229ff858a8SKarl Rupp } 40239ff858a8SKarl Rupp 4024039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) 402595639643SRichard Tran Mills { 4026a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = 
(Mat_SeqAIJ*)Y->data; 4027039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 4028039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 4029039c6fbaSStefano Zampini PetscScalar *ay; 4030039c6fbaSStefano Zampini const PetscScalar *ax; 4031039c6fbaSStefano Zampini CsrMatrix *csry,*csrx; 4032e6e9a74fSStefano Zampini 403395639643SRichard Tran Mills PetscFunctionBegin; 4034a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE*)Y->spptr; 4035a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE*)X->spptr; 4036039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 40379566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 40389566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 4039a587d139SMark PetscFunctionReturn(0); 404095639643SRichard Tran Mills } 4041039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 40429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 40439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 40445f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 40455f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported"); 4046039c6fbaSStefano Zampini csry = (CsrMatrix*)cy->mat->mat; 4047039c6fbaSStefano Zampini csrx = (CsrMatrix*)cx->mat->mat; 4048039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 4049039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 4050039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()); 4051039c6fbaSStefano Zampini if (eq) { 4052039c6fbaSStefano Zampini eq = 
thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin()); 4053039c6fbaSStefano Zampini } 4054039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 4055039c6fbaSStefano Zampini } 4056d2be01edSStefano Zampini /* spgeam is buggy with one column */ 4057d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 4058039c6fbaSStefano Zampini 4059039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 4060039c6fbaSStefano Zampini PetscScalar b = 1.0; 4061039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4062039c6fbaSStefano Zampini size_t bufferSize; 4063039c6fbaSStefano Zampini void *buffer; 4064039c6fbaSStefano Zampini #endif 4065039c6fbaSStefano Zampini 40669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 40679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 40689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 4069039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 40709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n, 4071039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 4072039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 40735f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize)); 40749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer,bufferSize)); 40759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 40769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 4077039c6fbaSStefano Zampini 
&a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 4078039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 40795f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer)); 40809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 40819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 40829566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 4083039c6fbaSStefano Zampini #else 40849566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 40859566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n, 4086039c6fbaSStefano Zampini &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(), 4087039c6fbaSStefano Zampini &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(), 40885f80ce2aSJacob Faibussowitsch cy->mat->descr, ay,csry->row_offsets->data().get(),csry->column_indices->data().get())); 40899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 40909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4091039c6fbaSStefano Zampini #endif 40929566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 40939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 40949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 40959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 4096039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 4097a587d139SMark cublasHandle_t cublasv2handle; 4098a587d139SMark PetscBLASInt one = 1, bnz = 1; 4099039c6fbaSStefano Zampini 41009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax)); 41019566063dSJacob 
Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 41029566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 41039566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz,&bnz)); 41049566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 41059566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one)); 41069566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*bnz)); 41079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 41089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax)); 41099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 41109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 4111039c6fbaSStefano Zampini } else { 41129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE)); 41139566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y,a,X,str)); 4114a587d139SMark } 411595639643SRichard Tran Mills PetscFunctionReturn(0); 411695639643SRichard Tran Mills } 411795639643SRichard Tran Mills 411833c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a) 411933c9ba73SStefano Zampini { 412033c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ*)Y->data; 412133c9ba73SStefano Zampini PetscScalar *ay; 412233c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 412333c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 412433c9ba73SStefano Zampini 412533c9ba73SStefano Zampini PetscFunctionBegin; 41269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay)); 41279566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 41289566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz,&bnz)); 41299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 41309566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one)); 41319566063dSJacob Faibussowitsch 
PetscCall(PetscLogGpuFlops(bnz)); 41329566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 41339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay)); 41349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 413533c9ba73SStefano Zampini PetscFunctionReturn(0); 413633c9ba73SStefano Zampini } 413733c9ba73SStefano Zampini 41383fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 41393fa6b06aSMark Adams { 41407e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 4141a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 41427e8381f9SStefano Zampini 41433fa6b06aSMark Adams PetscFunctionBegin; 41443fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 41453fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 41467e8381f9SStefano Zampini if (spptr->mat) { 41477e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 41487e8381f9SStefano Zampini if (matrix->values) { 41497e8381f9SStefano Zampini both = PETSC_TRUE; 41507e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 41517e8381f9SStefano Zampini } 41527e8381f9SStefano Zampini } 41537e8381f9SStefano Zampini if (spptr->matTranspose) { 41547e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 41557e8381f9SStefano Zampini if (matrix->values) { 41567e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 41577e8381f9SStefano Zampini } 41587e8381f9SStefano Zampini } 41593fa6b06aSMark Adams } 41609566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n])); 41619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 41627e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 4163a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 41643fa6b06aSMark Adams PetscFunctionReturn(0); 41653fa6b06aSMark Adams } 41663fa6b06aSMark Adams 
4167a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 4168a587d139SMark { 4169a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4170a587d139SMark 4171a587d139SMark PetscFunctionBegin; 41729a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 41739a14fc28SStefano Zampini A->boundtocpu = flg; 41749a14fc28SStefano Zampini PetscFunctionReturn(0); 41759a14fc28SStefano Zampini } 4176a587d139SMark if (flg) { 41779566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 4178a587d139SMark 417933c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 4180a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 4181a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 4182a587d139SMark A->ops->mult = MatMult_SeqAIJ; 4183a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 4184a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 4185a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 4186a587d139SMark A->ops->multhermitiantranspose = NULL; 4187a587d139SMark A->ops->multhermitiantransposeadd = NULL; 4188fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 41899566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps))); 41909566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL)); 41919566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL)); 41929566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL)); 41939566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 41949566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 41959566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL)); 4196a587d139SMark } else { 419733c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 4198a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 4199a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 4200a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 4201a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 4202a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 4203a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 4204a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 4205a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 4206fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 420767a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 420867a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 420967a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 421067a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 421167a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 421267a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 42137ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 42147ee59b9bSJunchao Zhang 42159566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 42169566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 42179566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 42189566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE)); 42199566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE)); 42209566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE)); 4221a587d139SMark } 4222a587d139SMark A->boundtocpu = flg; 4223ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 4224ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 4225ea500dcfSRichard Tran Mills } else { 4226ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 4227ea500dcfSRichard Tran Mills } 4228a587d139SMark PetscFunctionReturn(0); 4229a587d139SMark } 4230a587d139SMark

/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a SeqAIJCUSPARSE matrix.

  Input Parameters:
    A     - the matrix to convert
    mtype - the requested type (not referenced by the body)
    reuse - MAT_INITIAL_MATRIX duplicates A into *newmat; MAT_REUSE_MATRIX copies A into the
            existing *newmat; for any other value *newmat is used exactly as passed in
            (MatCreate_SeqAIJCUSPARSE() passes MAT_INPLACE_MATRIX with *newmat == A)

  Output Parameter:
    newmat - the matrix carrying the CUSPARSE function table, type name and GPU-side struct
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* replace the default vector type of B with the CUDA one */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  /* allocate the GPU-side struct only once; a reused matrix keeps the one it already has */
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      spptr->format     = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
#if CUSPARSE_VERSION > 11301
      spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
#else
      spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
#endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry the triangular-factor struct instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}

/* Constructor for MATSEQAIJCUSPARSE: builds a plain SeqAIJ matrix, then converts it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later
   and are not supported with CUDA 11.0 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

43193ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 432042c9c57cSBarry Smith { 432142c9c57cSBarry Smith PetscFunctionBegin; 43229566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band)); 43239566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse)); 43249566063dSJacob Faibussowitsch
PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse)); 43259566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse)); 43269566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse)); 4327bddcd29dSMark Adams 432842c9c57cSBarry Smith PetscFunctionReturn(0); 432942c9c57cSBarry Smith } 433029b38603SBarry Smith 4331cbc6b225SStefano Zampini static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 4332cbc6b225SStefano Zampini { 4333cbc6b225SStefano Zampini Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr; 4334cbc6b225SStefano Zampini 4335cbc6b225SStefano Zampini PetscFunctionBegin; 4336cbc6b225SStefano Zampini if (!cusp) PetscFunctionReturn(0); 4337cbc6b225SStefano Zampini delete cusp->cooPerm; 4338cbc6b225SStefano Zampini delete cusp->cooPerm_a; 4339cbc6b225SStefano Zampini cusp->cooPerm = NULL; 4340cbc6b225SStefano Zampini cusp->cooPerm_a = NULL; 4341cbc6b225SStefano Zampini if (cusp->use_extended_coo) { 43429566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->jmap_d)); 43439566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->perm_d)); 4344cbc6b225SStefano Zampini } 4345cbc6b225SStefano Zampini cusp->use_extended_coo = PETSC_FALSE; 4346cbc6b225SStefano Zampini PetscFunctionReturn(0); 4347cbc6b225SStefano Zampini } 4348cbc6b225SStefano Zampini 4349470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 43507f756511SDominic Meiser { 43517f756511SDominic Meiser PetscFunctionBegin; 43527f756511SDominic Meiser if (*cusparsestruct) { 43539566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format)); 43549566063dSJacob Faibussowitsch 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format)); 43557f756511SDominic Meiser delete (*cusparsestruct)->workVector; 435681902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 43577e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 43587e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 4359a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 43609566063dSJacob Faibussowitsch if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 43619566063dSJacob Faibussowitsch if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 43629566063dSJacob Faibussowitsch if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 43639566063dSJacob Faibussowitsch PetscCall(PetscFree(*cusparsestruct)); 43647f756511SDominic Meiser } 43657f756511SDominic Meiser PetscFunctionReturn(0); 43667f756511SDominic Meiser } 43677f756511SDominic Meiser

/*
  CsrMatrix_Destroy - Deletes a CsrMatrix together with its values, column_indices and
  row_offsets arrays, then clears the caller's pointer. A NULL *mat is accepted and ignored.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}

4381470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 43827f756511SDominic Meiser { 43837f756511SDominic Meiser PetscFunctionBegin; 43847f756511SDominic Meiser if (*trifactor) { 43859566063dSJacob Faibussowitsch if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4386261a78b4SJunchao Zhang if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 43879566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 43889566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 43899566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4390afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 43919566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4392afb2bd1cSJunchao Zhang #endif 43939566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactor)); 43947f756511SDominic Meiser } 43957f756511SDominic Meiser PetscFunctionReturn(0); 43967f756511SDominic Meiser } 43977f756511SDominic Meiser 4398470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 43997f756511SDominic Meiser { 44007f756511SDominic Meiser CsrMatrix *mat; 44017f756511SDominic Meiser 44027f756511SDominic Meiser PetscFunctionBegin; 44037f756511SDominic Meiser if (*matstruct) { 44047f756511SDominic Meiser if ((*matstruct)->mat) { 44057f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 4406afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4407afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4408afb2bd1cSJunchao Zhang #else 44097f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 44109566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4411afb2bd1cSJunchao Zhang #endif 44127f756511SDominic Meiser } else { 44137f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 44147f756511SDominic Meiser CsrMatrix_Destroy(&mat); 44157f756511SDominic Meiser } 
44167f756511SDominic Meiser } 44179566063dSJacob Faibussowitsch if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 44187f756511SDominic Meiser delete (*matstruct)->cprowIndices; 44199566063dSJacob Faibussowitsch if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 44209566063dSJacob Faibussowitsch if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 44219566063dSJacob Faibussowitsch if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4422afb2bd1cSJunchao Zhang 4423afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 4424afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 44259566063dSJacob Faibussowitsch if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4426afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 4427afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 44289566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 44299566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 44309566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4431afb2bd1cSJunchao Zhang } 4432afb2bd1cSJunchao Zhang } 4433afb2bd1cSJunchao Zhang #endif 44347f756511SDominic Meiser delete *matstruct; 44357e8381f9SStefano Zampini *matstruct = NULL; 44367f756511SDominic Meiser } 44377f756511SDominic Meiser PetscFunctionReturn(0); 44387f756511SDominic Meiser } 44397f756511SDominic Meiser 4440e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors) 44417f756511SDominic Meiser { 4442*da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4443*da112707SJunchao Zhang 44447f756511SDominic Meiser PetscFunctionBegin; 4445*da112707SJunchao Zhang if (fs) { 4446*da112707SJunchao Zhang 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4447*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4448*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4449*da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4450*da112707SJunchao Zhang delete fs->rpermIndices; 4451*da112707SJunchao Zhang delete fs->cpermIndices; 4452*da112707SJunchao Zhang delete fs->workVector; 4453*da112707SJunchao Zhang fs->rpermIndices = NULL; 4454*da112707SJunchao Zhang fs->cpermIndices = NULL; 4455*da112707SJunchao Zhang fs->workVector = NULL; 4456*da112707SJunchao Zhang if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d)); 4457*da112707SJunchao Zhang if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d)); 4458*da112707SJunchao Zhang fs->init_dev_prop = PETSC_FALSE; 4459*da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 4460*da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4461*da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx)); 4462*da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrVal)); 4463*da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->X)); 4464*da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->Y)); 4465*da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->factBuffer_M)); 4466*da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4467*da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4468*da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 4469*da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4470*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4471*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4472*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4473*da112707SJunchao Zhang 
PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4474*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4475*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4476*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4477*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4478*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4479*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4480*da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4481*da112707SJunchao Zhang fs->builtSolveTranspose = PETSC_FALSE; 4482*da112707SJunchao Zhang #endif 4483ccdfe979SStefano Zampini } 4484ccdfe979SStefano Zampini PetscFunctionReturn(0); 4485ccdfe979SStefano Zampini } 4486ccdfe979SStefano Zampini 4487ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 4488ccdfe979SStefano Zampini { 4489ccdfe979SStefano Zampini cusparseHandle_t handle; 4490ccdfe979SStefano Zampini 4491ccdfe979SStefano Zampini PetscFunctionBegin; 4492ccdfe979SStefano Zampini if (*trifactors) { 44939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 44947f756511SDominic Meiser if (handle = (*trifactors)->handle) { 44959566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroy(handle)); 44967f756511SDominic Meiser } 44979566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 44987f756511SDominic Meiser } 44997f756511SDominic Meiser PetscFunctionReturn(0); 45007f756511SDominic Meiser } 45017e8381f9SStefano Zampini 45027e8381f9SStefano Zampini struct IJCompare 45037e8381f9SStefano Zampini { 45047e8381f9SStefano Zampini __host__ __device__ 45057e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const 
thrust::tuple<PetscInt, PetscInt> &t2) 45067e8381f9SStefano Zampini { 45077e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 45087e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 45097e8381f9SStefano Zampini return false; 45107e8381f9SStefano Zampini } 45117e8381f9SStefano Zampini }; 45127e8381f9SStefano Zampini

/* Equality predicate on (row, column) pairs: true only when both components match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};

45237e8381f9SStefano Zampini struct IJDiff 45247e8381f9SStefano Zampini { 45257e8381f9SStefano Zampini __host__ __device__ 45267e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 45277e8381f9SStefano Zampini { 45287e8381f9SStefano Zampini return t1 == t2 ?
0 : 1; 45297e8381f9SStefano Zampini } 45307e8381f9SStefano Zampini }; 45317e8381f9SStefano Zampini 45327e8381f9SStefano Zampini struct IJSum 45337e8381f9SStefano Zampini { 45347e8381f9SStefano Zampini __host__ __device__ 45357e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 45367e8381f9SStefano Zampini { 45377e8381f9SStefano Zampini return t1||t2; 45387e8381f9SStefano Zampini } 45397e8381f9SStefano Zampini }; 45407e8381f9SStefano Zampini 45417e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 4542219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 4543219fbbafSJunchao Zhang PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 45447e8381f9SStefano Zampini { 45457e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4546fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4547bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 454808391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 45497e8381f9SStefano Zampini CsrMatrix *matrix; 45507e8381f9SStefano Zampini PetscInt n; 45517e8381f9SStefano Zampini 45527e8381f9SStefano Zampini PetscFunctionBegin; 455328b400f6SJacob Faibussowitsch PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 455428b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 45557e8381f9SStefano Zampini if (!cusp->cooPerm) { 45569566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 45579566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 45587e8381f9SStefano Zampini PetscFunctionReturn(0); 45597e8381f9SStefano Zampini } 45607e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 456128b400f6SJacob Faibussowitsch PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4562e61fc153SStefano 
Zampini if (!v) { 4563e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 4564e61fc153SStefano Zampini goto finalize; 45657e8381f9SStefano Zampini } 4566e61fc153SStefano Zampini n = cusp->cooPerm->size(); 456708391a17SStefano Zampini if (isCudaMem(v)) { 456808391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 456908391a17SStefano Zampini } else { 4570e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 4571e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 457208391a17SStefano Zampini d_v = cooPerm_v->data(); 45739566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 457408391a17SStefano Zampini } 45759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4576e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 4577ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 4578bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 457908391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 4580ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 4581ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 4582ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 
4583ddea5d60SJunchao Zhang */ 4584e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 4585e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 4586e61fc153SStefano Zampini delete cooPerm_w; 45877e8381f9SStefano Zampini } else { 4588ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 458908391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 45907e8381f9SStefano Zampini matrix->values->begin())); 459108391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 45927e8381f9SStefano Zampini matrix->values->end())); 4593ddea5d60SJunchao Zhang thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 45947e8381f9SStefano Zampini } 45957e8381f9SStefano Zampini } else { 4596e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 459708391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 4598e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 45997e8381f9SStefano Zampini } else { 460008391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 46017e8381f9SStefano Zampini matrix->values->begin())); 460208391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 
46037e8381f9SStefano Zampini matrix->values->end())); 46047e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 46057e8381f9SStefano Zampini } 46067e8381f9SStefano Zampini } 46079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4608e61fc153SStefano Zampini finalize: 4609e61fc153SStefano Zampini delete cooPerm_v; 46107e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 46119566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4612fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 46139566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz)); 46149566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n")); 46159566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax)); 4616fcdce8c4SStefano Zampini a->reallocs = 0; 4617fcdce8c4SStefano Zampini A->info.mallocs += 0; 4618fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 4619fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 4620fcdce8c4SStefano Zampini A->num_ass++; 46217e8381f9SStefano Zampini PetscFunctionReturn(0); 46227e8381f9SStefano Zampini } 46237e8381f9SStefano Zampini 4624a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4625a49f1ed0SStefano Zampini { 4626a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4627a49f1ed0SStefano Zampini 4628a49f1ed0SStefano Zampini PetscFunctionBegin; 4629a49f1ed0SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4630a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 4631a49f1ed0SStefano Zampini if (destroy) { 46329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format)); 4633a49f1ed0SStefano 
Zampini delete cusp->csr2csc_i; 4634a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 4635a49f1ed0SStefano Zampini } 46361a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 4637a49f1ed0SStefano Zampini PetscFunctionReturn(0); 4638a49f1ed0SStefano Zampini } 4639a49f1ed0SStefano Zampini 46407e8381f9SStefano Zampini #include <thrust/binary_search.h> 4641219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 4642219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 46437e8381f9SStefano Zampini { 46447e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 46457e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 46467e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 46477e8381f9SStefano Zampini 46487e8381f9SStefano Zampini PetscFunctionBegin; 46499566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->rmap)); 46509566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->cmap)); 46517e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 46527e8381f9SStefano Zampini if (n != cooPerm_n) { 46537e8381f9SStefano Zampini delete cusp->cooPerm; 46547e8381f9SStefano Zampini delete cusp->cooPerm_a; 46557e8381f9SStefano Zampini cusp->cooPerm = NULL; 46567e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 46577e8381f9SStefano Zampini } 46587e8381f9SStefano Zampini if (n) { 46597e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 46607e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 46617e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 46627e8381f9SStefano Zampini 46637e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 46647e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 46657e8381f9SStefano Zampini 46669566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 46677e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 46687e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 4669ddea5d60SJunchao Zhang 4670ddea5d60SJunchao Zhang /* Ex. 
4671ddea5d60SJunchao Zhang n = 6 4672ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 4673ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 4674ddea5d60SJunchao Zhang */ 46757e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 46767e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 46777e8381f9SStefano Zampini 46789566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 46797e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4680ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4681ddea5d60SJunchao Zhang *cusp->cooPerm_a = d_i; /* copy the sorted array */ 46827e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 46837e8381f9SStefano Zampini 4684ddea5d60SJunchao Zhang /* 4685ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 4686ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 4687ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 4688ddea5d60SJunchao Zhang */ 4689ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4690ddea5d60SJunchao Zhang 4691ddea5d60SJunchao Zhang /* 4692ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 4693ddea5d60SJunchao Zhang ^ekey 4694ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 4695ddea5d60SJunchao Zhang ^nekye 4696ddea5d60SJunchao Zhang */ 46977e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 46987e8381f9SStefano Zampini delete cusp->cooPerm_a; 46997e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 4700ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4701ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4702ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => 
[1,0,1,0,1,0]*/ 4703ddea5d60SJunchao Zhang adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4704ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 47057e8381f9SStefano Zampini w[0] = 0; 4706ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4707ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 47087e8381f9SStefano Zampini } 47097e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 4710ddea5d60SJunchao Zhang thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4711ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4712ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 47139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 47147e8381f9SStefano Zampini 47159566063dSJacob Faibussowitsch PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i)); 47167e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 47177e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 47187e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 47199566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->n+1,&a->i)); 4720ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 47219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 47227e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 4723fcdce8c4SStefano Zampini a->rmax = 0; 47249566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz,&a->a)); 47259566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz,&a->j)); 47269566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 47279566063dSJacob Faibussowitsch if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen)); 47289566063dSJacob Faibussowitsch if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax)); 47297e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 47307e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 47317e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 47327e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 4733fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 47347e8381f9SStefano Zampini } 4735fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 47367e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 47379566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt))); 47389566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(A)); 47397e8381f9SStefano Zampini } else { 47409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(A,0,NULL)); 
47417e8381f9SStefano Zampini } 47429566063dSJacob Faibussowitsch PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE)); 47437e8381f9SStefano Zampini 47447e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 4745e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 47469566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a,a->nz)); 47479566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6)); 47487e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 47499566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 47509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE)); 47517e8381f9SStefano Zampini PetscFunctionReturn(0); 47527e8381f9SStefano Zampini } 4753ed502f03SStefano Zampini 4754219fbbafSJunchao Zhang PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 4755219fbbafSJunchao Zhang { 4756219fbbafSJunchao Zhang Mat_SeqAIJ *seq; 4757219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev; 4758cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 4759219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4760219fbbafSJunchao Zhang 4761219fbbafSJunchao Zhang PetscFunctionBegin; 47629566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 47639566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4764219fbbafSJunchao Zhang if (coo_i) { 47659566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i,&mtype)); 4766219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 4767219fbbafSJunchao Zhang for (PetscCount k=0; k<coo_n; k++) { 4768cbc6b225SStefano Zampini if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = PETSC_FALSE; break;} 4769219fbbafSJunchao Zhang } 4770219fbbafSJunchao Zhang } 4771219fbbafSJunchao Zhang } 4772219fbbafSJunchao Zhang 
4773219fbbafSJunchao Zhang if (coo_basic) { /* i,j are on device or do not contain negative indices */ 47749566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 4775219fbbafSJunchao Zhang } else { 47769566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 4777cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 47789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4779219fbbafSJunchao Zhang seq = static_cast<Mat_SeqAIJ*>(mat->data); 4780219fbbafSJunchao Zhang dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr); 47819566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount))); 47829566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 47839566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount))); 47849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice)); 4785219fbbafSJunchao Zhang dev->use_extended_coo = PETSC_TRUE; 4786219fbbafSJunchao Zhang } 4787219fbbafSJunchao Zhang PetscFunctionReturn(0); 4788219fbbafSJunchao Zhang } 4789219fbbafSJunchao Zhang 479077804d84SJunchao Zhang __global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[]) 4791219fbbafSJunchao Zhang { 4792219fbbafSJunchao Zhang PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 4793219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4794b6c38306SJunchao Zhang for (; i<nnz; i+= grid_size) { 4795b6c38306SJunchao Zhang PetscScalar sum = 0.0; 4796b6c38306SJunchao Zhang for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]]; 4797b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES? 
0.0 : a[i]) + sum; 4798b6c38306SJunchao Zhang } 4799219fbbafSJunchao Zhang } 4800219fbbafSJunchao Zhang 4801219fbbafSJunchao Zhang PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4802219fbbafSJunchao Zhang { 4803219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ*)A->data; 4804219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr; 4805219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4806219fbbafSJunchao Zhang PetscMemType memtype; 4807219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4808219fbbafSJunchao Zhang PetscScalar *Aa; 4809219fbbafSJunchao Zhang 4810219fbbafSJunchao Zhang PetscFunctionBegin; 4811219fbbafSJunchao Zhang if (dev->use_extended_coo) { 48129566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v,&memtype)); 4813219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 48149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar))); 48159566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 4816219fbbafSJunchao Zhang } 4817219fbbafSJunchao Zhang 48189566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); 48199566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); 4820219fbbafSJunchao Zhang 4821cbc6b225SStefano Zampini if (Annz) { 4822b6c38306SJunchao Zhang MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa); 48239566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 4824cbc6b225SStefano Zampini } 4825219fbbafSJunchao Zhang 48269566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 48279566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 4828219fbbafSJunchao Zhang 48299566063dSJacob 
Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); 4830219fbbafSJunchao Zhang } else { 48319566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode)); 4832219fbbafSJunchao Zhang } 4833219fbbafSJunchao Zhang PetscFunctionReturn(0); 4834219fbbafSJunchao Zhang } 4835219fbbafSJunchao Zhang 48365b7e41feSStefano Zampini /*@C 48375b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 48385b7e41feSStefano Zampini 48395b7e41feSStefano Zampini Not collective 48405b7e41feSStefano Zampini 48415b7e41feSStefano Zampini Input Parameters: 48425b7e41feSStefano Zampini + A - the matrix 48435b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 48445b7e41feSStefano Zampini 48455b7e41feSStefano Zampini Output Parameters: 48465b7e41feSStefano Zampini + ia - the CSR row pointers 48475b7e41feSStefano Zampini - ja - the CSR column indices 48485b7e41feSStefano Zampini 48495b7e41feSStefano Zampini Level: developer 48505b7e41feSStefano Zampini 48515b7e41feSStefano Zampini Notes: 48525b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 48535b7e41feSStefano Zampini 4854db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 48555b7e41feSStefano Zampini @*/ 48565f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j) 48575f101d05SStefano Zampini { 48585f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 48595f101d05SStefano Zampini CsrMatrix *csr; 48605f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 48615f101d05SStefano Zampini 48625f101d05SStefano Zampini PetscFunctionBegin; 48635f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 48645f101d05SStefano Zampini if 
(!i || !j) PetscFunctionReturn(0); 48655f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4866aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 48679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 486828b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 48695f101d05SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 48705f101d05SStefano Zampini if (i) { 48715f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 48725f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 48735f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 48745f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 48759566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 48765f101d05SStefano Zampini } 48775f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 48785f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 48795f101d05SStefano Zampini } 48805f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 48815f101d05SStefano Zampini PetscFunctionReturn(0); 48825f101d05SStefano Zampini } 48835f101d05SStefano Zampini 48845b7e41feSStefano Zampini /*@C 48855b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 48865b7e41feSStefano Zampini 48875b7e41feSStefano Zampini Not collective 48885b7e41feSStefano Zampini 48895b7e41feSStefano Zampini Input Parameters: 48905b7e41feSStefano Zampini + A - the matrix 48915b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 48925b7e41feSStefano Zampini 48935b7e41feSStefano Zampini Output Parameters: 
48945b7e41feSStefano Zampini + ia - the CSR row pointers 48955b7e41feSStefano Zampini - ja - the CSR column indices 48965b7e41feSStefano Zampini 48975b7e41feSStefano Zampini Level: developer 48985b7e41feSStefano Zampini 4899db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()` 49005b7e41feSStefano Zampini @*/ 49015f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j) 49025f101d05SStefano Zampini { 49035f101d05SStefano Zampini PetscFunctionBegin; 49045f101d05SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 49055f101d05SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 49065f101d05SStefano Zampini if (i) *i = NULL; 49075f101d05SStefano Zampini if (j) *j = NULL; 49085f101d05SStefano Zampini PetscFunctionReturn(0); 49095f101d05SStefano Zampini } 49105f101d05SStefano Zampini 49115b7e41feSStefano Zampini /*@C 49125b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 49135b7e41feSStefano Zampini 49145b7e41feSStefano Zampini Not Collective 49155b7e41feSStefano Zampini 49165b7e41feSStefano Zampini Input Parameter: 49175b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 49185b7e41feSStefano Zampini 49195b7e41feSStefano Zampini Output Parameter: 49205b7e41feSStefano Zampini . 
a - pointer to the device data 49215b7e41feSStefano Zampini 49225b7e41feSStefano Zampini Level: developer 49235b7e41feSStefano Zampini 49245b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 49255b7e41feSStefano Zampini 4926db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 49275b7e41feSStefano Zampini @*/ 4928ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 4929ed502f03SStefano Zampini { 4930ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4931ed502f03SStefano Zampini CsrMatrix *csr; 4932ed502f03SStefano Zampini 4933ed502f03SStefano Zampini PetscFunctionBegin; 4934ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4935ed502f03SStefano Zampini PetscValidPointer(a,2); 4936ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4937aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 49389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 493928b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 4940ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 494128b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 4942ed502f03SStefano Zampini *a = csr->values->data().get(); 4943ed502f03SStefano Zampini PetscFunctionReturn(0); 4944ed502f03SStefano Zampini } 4945ed502f03SStefano Zampini 49465b7e41feSStefano Zampini /*@C 49475b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 49485b7e41feSStefano Zampini 49495b7e41feSStefano Zampini Not Collective 49505b7e41feSStefano Zampini 49515b7e41feSStefano Zampini 
Input Parameter: 49525b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 49535b7e41feSStefano Zampini 49545b7e41feSStefano Zampini Output Parameter: 49555b7e41feSStefano Zampini . a - pointer to the device data 49565b7e41feSStefano Zampini 49575b7e41feSStefano Zampini Level: developer 49585b7e41feSStefano Zampini 4959db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 49605b7e41feSStefano Zampini @*/ 4961ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 4962ed502f03SStefano Zampini { 4963ed502f03SStefano Zampini PetscFunctionBegin; 4964ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4965ed502f03SStefano Zampini PetscValidPointer(a,2); 4966ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4967ed502f03SStefano Zampini *a = NULL; 4968ed502f03SStefano Zampini PetscFunctionReturn(0); 4969ed502f03SStefano Zampini } 4970ed502f03SStefano Zampini 49715b7e41feSStefano Zampini /*@C 49725b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 49735b7e41feSStefano Zampini 49745b7e41feSStefano Zampini Not Collective 49755b7e41feSStefano Zampini 49765b7e41feSStefano Zampini Input Parameter: 49775b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 49785b7e41feSStefano Zampini 49795b7e41feSStefano Zampini Output Parameter: 49805b7e41feSStefano Zampini . 
a - pointer to the device data 49815b7e41feSStefano Zampini 49825b7e41feSStefano Zampini Level: developer 49835b7e41feSStefano Zampini 49845b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 49855b7e41feSStefano Zampini 4986db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 49875b7e41feSStefano Zampini @*/ 4988039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a) 4989039c6fbaSStefano Zampini { 4990039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 4991039c6fbaSStefano Zampini CsrMatrix *csr; 4992039c6fbaSStefano Zampini 4993039c6fbaSStefano Zampini PetscFunctionBegin; 4994039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 4995039c6fbaSStefano Zampini PetscValidPointer(a,2); 4996039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 4997aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 49989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 499928b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5000039c6fbaSStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 500128b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5002039c6fbaSStefano Zampini *a = csr->values->data().get(); 5003039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 50049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5005039c6fbaSStefano Zampini PetscFunctionReturn(0); 5006039c6fbaSStefano Zampini } 50075b7e41feSStefano Zampini /*@C 50085b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 
5009039c6fbaSStefano Zampini 50105b7e41feSStefano Zampini Not Collective 50115b7e41feSStefano Zampini 50125b7e41feSStefano Zampini Input Parameter: 50135b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 50145b7e41feSStefano Zampini 50155b7e41feSStefano Zampini Output Parameter: 50165b7e41feSStefano Zampini . a - pointer to the device data 50175b7e41feSStefano Zampini 50185b7e41feSStefano Zampini Level: developer 50195b7e41feSStefano Zampini 5020db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()` 50215b7e41feSStefano Zampini @*/ 5022039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a) 5023039c6fbaSStefano Zampini { 5024039c6fbaSStefano Zampini PetscFunctionBegin; 5025039c6fbaSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5026039c6fbaSStefano Zampini PetscValidPointer(a,2); 5027039c6fbaSStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 50289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 50299566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 5030039c6fbaSStefano Zampini *a = NULL; 5031039c6fbaSStefano Zampini PetscFunctionReturn(0); 5032039c6fbaSStefano Zampini } 5033039c6fbaSStefano Zampini 50345b7e41feSStefano Zampini /*@C 50355b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 50365b7e41feSStefano Zampini 50375b7e41feSStefano Zampini Not Collective 50385b7e41feSStefano Zampini 50395b7e41feSStefano Zampini Input Parameter: 50405b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 50415b7e41feSStefano Zampini 50425b7e41feSStefano Zampini Output Parameter: 50435b7e41feSStefano Zampini . 
a - pointer to the device data 50445b7e41feSStefano Zampini 50455b7e41feSStefano Zampini Level: developer 50465b7e41feSStefano Zampini 50475b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 50485b7e41feSStefano Zampini 5049db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 50505b7e41feSStefano Zampini @*/ 5051ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 5052ed502f03SStefano Zampini { 5053ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 5054ed502f03SStefano Zampini CsrMatrix *csr; 5055ed502f03SStefano Zampini 5056ed502f03SStefano Zampini PetscFunctionBegin; 5057ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5058ed502f03SStefano Zampini PetscValidPointer(a,2); 5059ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5060aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 506128b400f6SJacob Faibussowitsch PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5062ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 506328b400f6SJacob Faibussowitsch PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 5064ed502f03SStefano Zampini *a = csr->values->data().get(); 5065039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 50669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)); 5067ed502f03SStefano Zampini PetscFunctionReturn(0); 5068ed502f03SStefano Zampini } 5069ed502f03SStefano Zampini 50705b7e41feSStefano Zampini /*@C 50715b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 50725b7e41feSStefano Zampini 
50735b7e41feSStefano Zampini Not Collective 50745b7e41feSStefano Zampini 50755b7e41feSStefano Zampini Input Parameter: 50765b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 50775b7e41feSStefano Zampini 50785b7e41feSStefano Zampini Output Parameter: 50795b7e41feSStefano Zampini . a - pointer to the device data 50805b7e41feSStefano Zampini 50815b7e41feSStefano Zampini Level: developer 50825b7e41feSStefano Zampini 5083db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 50845b7e41feSStefano Zampini @*/ 5085ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 5086ed502f03SStefano Zampini { 5087ed502f03SStefano Zampini PetscFunctionBegin; 5088ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5089ed502f03SStefano Zampini PetscValidPointer(a,2); 5090ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 50919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 50929566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 5093ed502f03SStefano Zampini *a = NULL; 5094ed502f03SStefano Zampini PetscFunctionReturn(0); 5095ed502f03SStefano Zampini } 5096ed502f03SStefano Zampini 5097ed502f03SStefano Zampini struct IJCompare4 5098ed502f03SStefano Zampini { 5099ed502f03SStefano Zampini __host__ __device__ 51002ed87e7eSStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 5101ed502f03SStefano Zampini { 5102ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 5103ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 5104ed502f03SStefano Zampini return false; 5105ed502f03SStefano Zampini } 5106ed502f03SStefano Zampini }; 5107ed502f03SStefano Zampini 51088909a122SStefano Zampini struct Shift 51098909a122SStefano Zampini { 5110ed502f03SStefano Zampini int _shift; 5111ed502f03SStefano Zampini 
5112ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 5113ed502f03SStefano Zampini __host__ __device__ 5114ed502f03SStefano Zampini inline int operator() (const int &c) 5115ed502f03SStefano Zampini { 5116ed502f03SStefano Zampini return c + _shift; 5117ed502f03SStefano Zampini } 5118ed502f03SStefano Zampini }; 5119ed502f03SStefano Zampini 5120ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 5121ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 5122ed502f03SStefano Zampini { 5123ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 5124ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 5125ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 5126ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 5127ed502f03SStefano Zampini PetscInt Annz,Bnnz; 5128ed502f03SStefano Zampini cusparseStatus_t stat; 5129ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 5130ed502f03SStefano Zampini 5131ed502f03SStefano Zampini PetscFunctionBegin; 5132ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 5133ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 5134ed502f03SStefano Zampini PetscValidPointer(C,4); 5135ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 5136ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 51375f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 513808401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 5139aed4548fSBarry Smith PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != 
MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5140aed4548fSBarry Smith PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 5141ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 5142ed502f03SStefano Zampini m = A->rmap->n; 5143ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 51449566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,C)); 51459566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C,m,n,m,n)); 51469566063dSJacob Faibussowitsch PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE)); 5147ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 5148ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 5149ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 5150ed502f03SStefano Zampini Ccsr = new CsrMatrix; 5151ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 5152ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 5153ed502f03SStefano Zampini c->compressedrow.nrows = 0; 5154ed502f03SStefano Zampini c->compressedrow.i = NULL; 5155ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 5156ed502f03SStefano Zampini Ccusp->workVector = NULL; 5157ed502f03SStefano Zampini Ccusp->nrows = m; 5158ed502f03SStefano Zampini Ccusp->mat = Cmat; 5159ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 5160ed502f03SStefano Zampini Ccsr->num_rows = m; 5161ed502f03SStefano Zampini Ccsr->num_cols = n; 51629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 51639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 51649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 51659566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar))); 51669566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void 
**)&(Cmat->beta_zero),sizeof(PetscScalar))); 51679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 51689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 51699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 51709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 51719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 51729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 517328b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 517428b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5175ed502f03SStefano Zampini 5176ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 5177ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 5178ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 5179ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 5180ed502f03SStefano Zampini c->nz = Annz + Bnnz; 5181ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 5182ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 5183ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 5184ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 5185ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 5186ed502f03SStefano Zampini if (c->nz) { 51872ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 51882ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 51892ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 51902ed87e7eSStefano Zampini THRUSTINTARRAY32 
*Aroff,*Broff; 51912ed87e7eSStefano Zampini 5192ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 5193ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 5194ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 5195ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 51969566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt))); 5197ed502f03SStefano Zampini } 51982ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 51992ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 5200ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 5201ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 5202ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 5203ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 52049566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt))); 5205ed502f03SStefano Zampini } 52062ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 52072ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 52089566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 52092ed87e7eSStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 52102ed87e7eSStefano Zampini Aroff->data().get(), 52112ed87e7eSStefano Zampini Annz, 52122ed87e7eSStefano Zampini m, 52132ed87e7eSStefano Zampini Acoo->data().get(), 52149566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 5215ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 52162ed87e7eSStefano Zampini Broff->data().get(), 5217ed502f03SStefano Zampini Bnnz, 5218ed502f03SStefano Zampini m, 52192ed87e7eSStefano Zampini Bcoo->data().get(), 52209566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 52212ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 
52222ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 52232ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 52248909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0) 5225ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 5226ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 52278909a122SStefano Zampini #else 52288909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 52298909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 52308909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 52318909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n)); 52328909a122SStefano Zampini #endif 52332ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz); 52342ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 52352ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 52362ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm)); 52372ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm)); 52382ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin())); 5239ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 5240ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 5241ed502f03SStefano Zampini thrust::advance(p2,Annz); 52422ed87e7eSStefano Zampini 
PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4())); 52438909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0) 52448909a122SStefano Zampini thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n)); 52458909a122SStefano Zampini #endif 52462ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 52472ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 52482ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 52492ed87e7eSStefano Zampini PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 52502ed87e7eSStefano Zampini #else 52512ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 52522ed87e7eSStefano Zampini PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred)); 52532ed87e7eSStefano Zampini PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred)); 52542ed87e7eSStefano Zampini #endif 5255ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 52562ed87e7eSStefano Zampini Ccoo->data().get(), 5257ed502f03SStefano Zampini c->nz, 5258ed502f03SStefano Zampini m, 5259ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 52609566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat); 52619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 52622ed87e7eSStefano Zampini delete wPerm; 52632ed87e7eSStefano Zampini delete Acoo; 52642ed87e7eSStefano Zampini delete Bcoo; 52652ed87e7eSStefano Zampini delete Ccoo; 5266ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 5267ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 5268ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 5269ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 52709566063dSJacob Faibussowitsch 
CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 5271ed502f03SStefano Zampini #endif 52721a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 52739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 52749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 5275ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5276ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 5277ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 5278ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 5279ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 5280ed502f03SStefano Zampini 52811a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 52821a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 5283a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 5284ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 5285ed502f03SStefano Zampini CmatT->mat = CcsrT; 5286ed502f03SStefano Zampini CcsrT->num_rows = n; 5287ed502f03SStefano Zampini CcsrT->num_cols = m; 5288ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 5289ed502f03SStefano Zampini 5290ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 5291ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 5292ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 5293ed502f03SStefano Zampini 52949566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 5295ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 5296ed502f03SStefano Zampini if (AT) { 5297ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 
5298ed502f03SStefano Zampini thrust::advance(rT,-1); 5299ed502f03SStefano Zampini } 5300ed502f03SStefano Zampini if (BT) { 5301ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 5302ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 5303ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 5304ed502f03SStefano Zampini } 5305ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 5306ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 5307ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 5308ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 5309ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 5310ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 53119566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5312ed502f03SStefano Zampini 53139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 53149566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 53159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 53169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar))); 53179566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar))); 53189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 53199566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 53209566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice)); 53219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice)); 5322ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 5323ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 5324ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 5325ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 53269566063dSJacob Faibussowitsch CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat); 5327ed502f03SStefano Zampini #endif 5328ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 5329ed502f03SStefano Zampini } 5330ed502f03SStefano Zampini } 5331ed502f03SStefano Zampini 5332ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 5333ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 5334ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 53359566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m+1,&c->i)); 53369566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->j)); 5337ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 5338ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 5339ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 5340ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 5341ed502f03SStefano Zampini jj = *Ccsr->column_indices; 53429566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 53439566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5344ed502f03SStefano Zampini } else { 
53459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 53469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 5347ed502f03SStefano Zampini } 53489566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt))); 53499566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->ilen)); 53509566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&c->imax)); 5351ed502f03SStefano Zampini c->maxnz = c->nz; 5352ed502f03SStefano Zampini c->nonzerorowcnt = 0; 5353ed502f03SStefano Zampini c->rmax = 0; 5354ed502f03SStefano Zampini for (i = 0; i < m; i++) { 5355ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 5356ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 5357ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 5358ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 5359ed502f03SStefano Zampini } 53609566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 53619566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz,&c->a)); 5362ed502f03SStefano Zampini (*C)->nonzerostate++; 53639566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 53649566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 5365ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 5366ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 5367ed502f03SStefano Zampini } else { 536808401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n); 5369ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 5370ed502f03SStefano Zampini if (c->nz) { 5371ed502f03SStefano Zampini Ccusp = 
(Mat_SeqAIJCUSPARSE*)(*C)->spptr; 53725f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 5373aed4548fSBarry Smith PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 537408401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 53759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 53769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 53775f80ce2aSJacob Faibussowitsch PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 53785f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 5379ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 5380ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 5381ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 5382aed4548fSBarry Smith PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size()); 5383aed4548fSBarry Smith PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 5384aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 5385aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 53865f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm->size() == 
Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 5387ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 5388ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 53899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 5390ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 5391ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 5392ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 5393ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 5394ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 5395ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 5396ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 5397ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 5398ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 5399ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 54009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE)); 54011a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 54025f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 5403ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5404ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? 
(CsrMatrix*)Acusp->matTranspose->mat : NULL; 5405ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 5406ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 5407ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 5408ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 5409ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 54101a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 5411ed502f03SStefano Zampini } 54129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5413ed502f03SStefano Zampini } 5414ed502f03SStefano Zampini } 54159566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 5416ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 5417ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 5418ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 5419ed502f03SStefano Zampini PetscFunctionReturn(0); 5420ed502f03SStefano Zampini }

/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the matrix values array into v

  Input Parameters:
+ A   - the matrix; its values array is obtained with MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of entries to copy
- idx - host array of n positions in the values array to gather, or NULL to copy the first n entries

  Output Parameter:
. v - destination for the n entries; isCudaMem(v) selects whether it is written as device or host memory

  Notes:
  With a non-NULL idx (and n > 0) the indices are uploaded and the gather runs through Thrust on the
  device. If v is not device memory the gathered entries are staged in a temporary device array and
  then copied to v with cudaMemcpy().

  Without idx (or with n == 0) the first n entries are copied with a single cudaMemcpy().
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the requested positions so the gather can be done on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    /* Staging buffer, only sized when v is host memory. It has automatic storage so that it is
       released also when one of the error-checking macros below returns early */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[k]] with dv[k]; VecCUDAEquals is applied to every pair (presumably it assigns the
       first tuple entry to the second - confirm against its definition) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      PetscCallCUDA(cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the n scalars travelled from the device to the host */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}