19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 16d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14 17d0967f54SJacob Faibussowitsch #define PETSC_HAVE_THRUST_ASYNC 1 18d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14 19a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 20d0967f54SJacob Faibussowitsch #endif 21a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 22a2cee5feSJed Brown #include <thrust/remove.h> 23a2cee5feSJed Brown #include <thrust/sort.h> 24a2cee5feSJed Brown #include <thrust/unique.h> 25e8d2b73aSMark Adams 26e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 27afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 28afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 29afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
30afb2bd1cSJunchao Zhang 31afb2bd1cSJunchao Zhang typedef enum { 32afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 35afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 36afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 37afb2bd1cSJunchao Zhang 38afb2bd1cSJunchao Zhang typedef enum { 39afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 40afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 41afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 42afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 43afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 47afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 48afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 49afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 50afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 51afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 52afb2bd1cSJunchao Zhang 53afb2bd1cSJunchao Zhang typedef enum { 54afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 55afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 56afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 57afb2bd1cSJunchao Zhang */ 58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 59afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", 
"CUSPARSE_SPMM_", 0}; 60afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 61afb2bd1cSJunchao Zhang #endif 629ae82921SPaul Mullowney 63087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 65087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 686fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 696fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 70087f3262SPaul Mullowney 716fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 726fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 736fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 746fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 75dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject); 76a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 7733c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 786fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 796fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 806fa9248bSJed Brown static PetscErrorCode 
MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format in the Mat_SeqAIJCUSPARSE context attached to A->spptr. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* both supported ops set the same (single) format field for a sequential matrix */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
           `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if the matrix type registered one */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}

/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): stores the flag
   in the Mat_SeqAIJCUSPARSE context so the factorization routines can consult it. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).

  Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if the matrix type registered one */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}

/* MatSetOption override for MATSEQAIJCUSPARSE: handles MAT_FORM_EXPLICIT_TRANSPOSE
   itself and forwards every other option to the plain SeqAIJ implementation. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
/* Numeric LU factorization for MATSEQAIJCUSPARSE.
   Pulls the values of A back to the host, runs the CPU kernel
   MatLUFactorNumeric_SeqAIJ(), then selects the GPU (or CPU) MatSolve variants
   and, unless CPU solves were requested, uploads the triangular factors. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col; /* row/column permutations chosen by the symbolic factorization */
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* the CPU kernel reads A's host values, so make sure they are up to date */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factors now live on the host only */
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    /* identity permutations allow the cheaper NaturalOrdering solves (no vector permutation) */
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no GPU MatMatSolve support here; clear the pointers so the generic path is used */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}

/* Processes the -mat_cusparse_* runtime options: storage format for SpMV/TriSolve,
   CPU-vs-GPU solve, and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
   Options are only offered for unfactored matrices (A->factortype == MAT_FACTOR_NONE). */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  /* Builds (or refreshes) the GPU copy of the unit-diagonal lower-triangular
     factor L of an (I)LU factorization held in SeqAIJ format on the host.
     First call: allocates the CSR structure, runs the cusparse solve analysis,
     and stashes the result in cusparseTriFactors->loTriFactorPtr.
     Later calls: only re-uploads the numerical values (sparsity is unchanged). */
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi; /* assumes a->i/a->j/a->a hold the factored-matrix layout produced by MatLUFactorNumeric_SeqAIJ -- TODO confirm */
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  /* only (re)build when the up-to-date values live on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit diagonals plus the strictly-lower entries of rows 1..n-1 (row 0 of L holds only its unit diagonal, hence ai[1] rather than ai[0]) */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers so the thrust assigns below copy efficiently */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0; /* row 0 is just the implicit unit diagonal */
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i]; /* strictly-lower entries of row i */
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the off-diagonal part of the row ... */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* ... then append the explicit unit diagonal entry */
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* thrust device arrays: assign() copies the pinned host buffers to the GPU */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* newer csrsv2 API needs an explicit work buffer sized by a separate query */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h = AALo; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        /* same traversal as the build path, but writing values only into the cached pinned buffer */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
Faibussowitsch { 4029ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4039ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4049ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 405aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 4069ae82921SPaul Mullowney const PetscInt *aj = a->j, *adiag = a->diag, *vi; 4079ae82921SPaul Mullowney const MatScalar *aa = a->a, *v; 4089ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 4099ae82921SPaul Mullowney PetscInt i, nz, nzUpper, offset; 4109ae82921SPaul Mullowney 4119ae82921SPaul Mullowney PetscFunctionBegin; 412cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 413c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 4149ae82921SPaul Mullowney try { 4159ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 4169ae82921SPaul Mullowney nzUpper = adiag[0] - adiag[n]; 417da79fbbcSStefano Zampini if (!upTriFactor) { 4182cbc15d9SMark PetscScalar *AAUp; 4192cbc15d9SMark 4209566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 4212cbc15d9SMark 4229ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 4239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 4249566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 4259ae82921SPaul Mullowney 4269ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 4279ae82921SPaul Mullowney AiUp[0] = (PetscInt)0; 4289ae82921SPaul Mullowney AiUp[n] = nzUpper; 4299ae82921SPaul Mullowney offset = nzUpper; 4309ae82921SPaul Mullowney for (i = n - 1; i >= 0; i--) { 4319ae82921SPaul Mullowney v = aa + adiag[i + 1] + 1; 4329ae82921SPaul Mullowney vi = aj + adiag[i + 1] + 1; 4339ae82921SPaul Mullowney 434e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 4359ae82921SPaul Mullowney nz = adiag[i] - adiag[i + 1] - 1; 4369ae82921SPaul Mullowney 437e057df02SPaul Mullowney /* decrement the offset */ 4389ae82921SPaul Mullowney offset -= (nz + 1); 4399ae82921SPaul Mullowney 440e057df02SPaul Mullowney /* first, set the diagonal elements */ 4419ae82921SPaul Mullowney AjUp[offset] = (PetscInt)i; 44209f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1. 
/ v[nz]; 4439ae82921SPaul Mullowney AiUp[i] = AiUp[i + 1] - (nz + 1); 4449ae82921SPaul Mullowney 4459566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz)); 4469566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz)); 4479ae82921SPaul Mullowney } 4482205254eSKarl Rupp 449aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 4509566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 451da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 4522205254eSKarl Rupp 453aa372e3fSPaul Mullowney /* Create the matrix description */ 4549566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 4559566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 4561b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4579566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 458afb2bd1cSJunchao Zhang #else 4599566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 460afb2bd1cSJunchao Zhang #endif 4619566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 4629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 463aa372e3fSPaul Mullowney 464aa372e3fSPaul Mullowney /* set the operation */ 465aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 466aa372e3fSPaul Mullowney 467aa372e3fSPaul Mullowney /* set the matrix */ 468aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 469aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 470aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 471aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 
472aa372e3fSPaul Mullowney 473aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 474aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1); 475aa372e3fSPaul Mullowney 476aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 477aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper); 478aa372e3fSPaul Mullowney 479aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 480aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper); 481aa372e3fSPaul Mullowney 482afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 4839566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 484261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 4851b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4869371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 4879371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 4889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 489afb2bd1cSJunchao Zhang #endif 490afb2bd1cSJunchao Zhang 491aa372e3fSPaul Mullowney /* perform the solve analysis */ 4929371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 4939371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), 
upTriFactor->csrMat->column_indices->data().get(), 4941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4959371c9d4SSatish Balay upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 496d49cd2b7SBarry Smith #else 4975f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 498afb2bd1cSJunchao Zhang #endif 4999566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 5009566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 501aa372e3fSPaul Mullowney 502da79fbbcSStefano Zampini /* assign the pointer */ 503aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 5042cbc15d9SMark upTriFactor->AA_h = AAUp; 5059566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 5069566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 5079566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar))); 508da79fbbcSStefano Zampini } else { 50948a46eb9SPierre Jolivet if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar))); 510da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 511da79fbbcSStefano Zampini offset = nzUpper; 512da79fbbcSStefano Zampini for (i = n - 1; i >= 0; i--) { 513da79fbbcSStefano Zampini v = aa + adiag[i + 1] + 1; 514da79fbbcSStefano Zampini 515da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 516da79fbbcSStefano Zampini nz = adiag[i] - adiag[i + 1] - 1; 517da79fbbcSStefano Zampini 518da79fbbcSStefano Zampini /* decrement the offset */ 519da79fbbcSStefano Zampini offset -= (nz + 1); 520da79fbbcSStefano Zampini 521da79fbbcSStefano Zampini /* first, set the diagonal elements */ 5222cbc15d9SMark upTriFactor->AA_h[offset] = 1. 
/* Builds/refreshes both ILU triangular factors on the GPU and, when the
   row/column orderings are non-trivial, caches their index sets on the device
   so the triangular solves can permute without host round-trips. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowIS   = aij->row;
  IS                            colIS   = aij->icol;
  PetscBool                     rowIsIdentity, colIsIdentity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

  /* upload/refresh the lower and upper triangular factors */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* scratch vector shared by the two triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular (row ordering) indices — only uploaded once, and only
     when the ordering is not the identity */
  PetscCall(ISIdentity(rowIS, &rowIsIdentity));
  if (!rowIsIdentity && !factors->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(rowIS, &ridx));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(ridx, ridx + m);
    PetscCall(ISRestoreIndices(rowIS, &ridx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  /* upper triangular (column ordering) indices */
  PetscCall(ISIdentity(colIS, &colIsIdentity));
  if (!colIsIdentity && !factors->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(colIS, &cidx));
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(cidx, cidx + m);
    PetscCall(ISRestoreIndices(colIS, &cidx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
/*
  Builds (or numerically refreshes) the two cuSPARSE triangular-solve factors
  for an ICC(0) factorization stored in symmetric (upper-triangle-only) form.

  From the stored upper triangle it assembles, on the host in pinned memory:
    - AAUp: unit-diagonal-style values for the upper factor (diagonal entries
      replaced by 1/diag, off-diagonals negated),
    - AALo: the same off-diagonals additionally scaled by 1/diag; this array is
      attached to a factor solved with CUSPARSE_OPERATION_TRANSPOSE, so it acts
      as the lower factor without storing a second sparsity pattern.
  On the first call both factor structures (pattern + values + solve analysis)
  are created; on later calls only the values arrays are re-uploaded.

  NOTE(review): A->data is cast both to Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ
  (for i/j/a) — this relies on the factor matrix actually carrying SBAIJ-style
  storage; confirm against the callers.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* first build: create pattern + values + solve analysis for both factors */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: store 1/diag; the diagonal is
             the LAST entry of the row (v[nz]) in this storage */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* off-diagonals: negated for U, negated AND scaled by 1/diag for L */
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        /* unit diagonal: AAUp stores 1/diag in the diagonal slots but the solve ignores it */
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 path needs an explicit work buffer sized by the library */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* UPPER fill mode is intentional: the lower solve is done as U^T
           (solveOp below is CUSPARSE_OPERATION_TRANSPOSE) */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: reuses the upper-triangle pattern (AiUp/AjUp) with
           the AALo values */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* refresh: the sparsity pattern is already on the GPU, recompute
           values only (same arithmetic as the build branch above) */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 784*d71ae5a4SJacob Faibussowitsch { 785087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 786087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 787087f3262SPaul Mullowney IS ip = a->row; 788087f3262SPaul Mullowney PetscBool perm_identity; 789087f3262SPaul Mullowney PetscInt n = A->rmap->n; 790087f3262SPaul Mullowney 791087f3262SPaul Mullowney PetscFunctionBegin; 79228b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 7939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 794ad540459SPierre Jolivet if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 795aa372e3fSPaul Mullowney cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 796aa372e3fSPaul Mullowney 797da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 798da79fbbcSStefano Zampini 799087f3262SPaul Mullowney /* lower triangular indices */ 8009566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip, &perm_identity)); 801087f3262SPaul Mullowney if (!perm_identity) { 8024e4bbfaaSStefano Zampini IS iip; 803da79fbbcSStefano Zampini const PetscInt *irip, *rip; 8044e4bbfaaSStefano Zampini 8059566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 8069566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iip, &irip)); 8079566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ip, &rip)); 808aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 809aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip + n); 810aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 8114e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip + n); 8129566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iip, &irip)); 8139566063dSJacob Faibussowitsch 
/* Numeric Cholesky factorization for SEQAIJCUSPARSE: pull A back to the host,
   run the SeqAIJ numeric factorization there, choose the GPU solve routines
   based on the ordering, then upload the triangular factors. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *bij = (Mat_SeqAIJ *)B->data;
  PetscBool   natural;

  PetscFunctionBegin;
  /* the factorization itself runs on the CPU, so B is host-only afterwards */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: the natural-ordering
     variants avoid the permutation step */
  PetscCall(ISIdentity(bij->row, &natural));
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
860b175d8bbSPaul Mullowney 861bda325fcSPaul Mullowney PetscFunctionBegin; 862aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 8639566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 864da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 865aa372e3fSPaul Mullowney 866aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 867aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 868aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 8699371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 870aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 871aa372e3fSPaul Mullowney 872aa372e3fSPaul Mullowney /* Create the matrix description */ 8739566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 8749566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 8759566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 8769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 8779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 878aa372e3fSPaul Mullowney 879aa372e3fSPaul Mullowney /* set the operation */ 880aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 881aa372e3fSPaul Mullowney 882aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 883aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 884afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 885afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 
886aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 887afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 888afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 889afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 890aa372e3fSPaul Mullowney 891aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 892afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 8939371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 8949371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 8959371c9d4SSatish Balay loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 8969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 897afb2bd1cSJunchao Zhang #endif 898afb2bd1cSJunchao Zhang 8999566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 9009371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 9019371c9d4SSatish Balay loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 902afb2bd1cSJunchao Zhang 
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 9039371c9d4SSatish Balay loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 904afb2bd1cSJunchao Zhang #else 9059371c9d4SSatish Balay loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase)); 906afb2bd1cSJunchao Zhang #endif 9079566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9089566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 909aa372e3fSPaul Mullowney 910afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 9119566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 912261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 9131b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 9149371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 9159371c9d4SSatish Balay loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 9169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 917afb2bd1cSJunchao Zhang #endif 918afb2bd1cSJunchao Zhang 919afb2bd1cSJunchao Zhang /* perform the solve analysis */ 9209371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 9219371c9d4SSatish Balay
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 9221b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 9239371c9d4SSatish Balay loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 924d49cd2b7SBarry Smith #else 9255f80ce2aSJacob Faibussowitsch loTriFactorT->solveInfo)); 926afb2bd1cSJunchao Zhang #endif 9279566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9289566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 929aa372e3fSPaul Mullowney 930da79fbbcSStefano Zampini /* assign the pointer */ 931aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 932aa372e3fSPaul Mullowney 933aa372e3fSPaul Mullowney /*********************************************/ 934aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 935aa372e3fSPaul Mullowney /*********************************************/ 936aa372e3fSPaul Mullowney 937aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 9389566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactorT)); 939da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 940aa372e3fSPaul Mullowney 941aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 942aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 943aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 9449371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 945aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 946aa372e3fSPaul Mullowney 947aa372e3fSPaul Mullowney /* Create the matrix description */ 9489566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 9499566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 9509566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 9519566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 9529566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 953aa372e3fSPaul Mullowney 954aa372e3fSPaul Mullowney /* set the operation */ 955aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 956aa372e3fSPaul Mullowney 957aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 958aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 959afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 960afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 961aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 962afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 963afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 964afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 965aa372e3fSPaul Mullowney 966aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 967afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 9689371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 9699371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 9709371c9d4SSatish Balay upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 9719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 972afb2bd1cSJunchao Zhang #endif 973afb2bd1cSJunchao Zhang 9749566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 9759371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 9769371c9d4SSatish Balay upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 977afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 9789371c9d4SSatish Balay upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 979afb2bd1cSJunchao Zhang #else 9809371c9d4SSatish Balay upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase)); 981afb2bd1cSJunchao Zhang #endif 982d49cd2b7SBarry Smith 9839566063dSJacob Faibussowitsch 
PetscCallCUDA(WaitForCUDA()); 9849566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 985aa372e3fSPaul Mullowney 986afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 9879566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 988261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 9891b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 9909371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 9919371c9d4SSatish Balay upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 9929566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 993afb2bd1cSJunchao Zhang #endif 994afb2bd1cSJunchao Zhang 995afb2bd1cSJunchao Zhang /* perform the solve analysis */ 9965f80ce2aSJacob Faibussowitsch /* TODO(review): factor the duplicated lower/upper transpose-analysis code into a shared helper
*/ 9979371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 9989371c9d4SSatish Balay upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 9991b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 10009371c9d4SSatish Balay upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1001d49cd2b7SBarry Smith #else 10025f80ce2aSJacob Faibussowitsch upTriFactorT->solveInfo)); 1003afb2bd1cSJunchao Zhang #endif 1004d49cd2b7SBarry Smith 10059566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 10069566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1007aa372e3fSPaul Mullowney 1008da79fbbcSStefano Zampini /* assign the pointer */ 1009aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1010bda325fcSPaul Mullowney PetscFunctionReturn(0); 1011bda325fcSPaul Mullowney } 1012bda325fcSPaul Mullowney 10139371c9d4SSatish Balay struct PetscScalarToPetscInt { 10149371c9d4SSatish Balay __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1015a49f1ed0SStefano Zampini }; 1016a49f1ed0SStefano Zampini 1017*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1018*d71ae5a4SJacob Faibussowitsch { 1019aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1020a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1021bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1022bda325fcSPaul Mullowney cusparseStatus_t stat; 1023aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1024b175d8bbSPaul Mullowney 1025bda325fcSPaul Mullowney PetscFunctionBegin; 
10269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1027a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 102828b400f6SJacob Faibussowitsch PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1029a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 103008401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 10311a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 10329566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 10339566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 103448a46eb9SPierre Jolivet if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1035a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1036aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 10379566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1038aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 10399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 10409566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1041aa372e3fSPaul Mullowney 1042b06137fdSPaul Mullowney /* set alpha and beta */ 10439566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar))); 10449566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar))); 10459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 10469566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 10479566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 10489566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1049b06137fdSPaul Mullowney 1050aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1051aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1052a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1053554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1054554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1055aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1056a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1057aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1058aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1059a3fdcf43SKarl Rupp 1060ad540459SPierre Jolivet if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 106181902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1062afb2bd1cSJunchao Zhang 1063afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 10643606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 10659371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 10669371c9d4SSatish Balay indexBase, cusparse_scalartype); 10679371c9d4SSatish Balay PetscCallCUSPARSE(stat); 10683606e59fSJunchao Zhang #else 10693606e59fSJunchao Zhang /* cusparse-11.x 
returns errors with zero-sized matrices until 11.2.1, 10703606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 10713606e59fSJunchao Zhang 10723606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 10733606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 10743606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 10753606e59fSJunchao Zhang */ 10763606e59fSJunchao Zhang if (matrixT->num_entries) { 10779371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 10789371c9d4SSatish Balay PetscCallCUSPARSE(stat); 10793606e59fSJunchao Zhang 10803606e59fSJunchao Zhang } else { 10813606e59fSJunchao Zhang matstructT->matDescr = NULL; 10823606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 10833606e59fSJunchao Zhang } 10843606e59fSJunchao Zhang #endif 1085afb2bd1cSJunchao Zhang #endif 1086aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1087afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1088afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1089afb2bd1cSJunchao Zhang #else 1090aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 109151c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 109251c6d536SStefano Zampini /* First convert HYB to CSR */ 1093aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1094aa372e3fSPaul Mullowney 
temp->num_cols = A->cmap->n; 1095aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1096aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1097aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1098aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1099aa372e3fSPaul Mullowney 11009371c9d4SSatish Balay stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 11019371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1102aa372e3fSPaul Mullowney 1103aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1104aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1105aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1106aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1107aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1108aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1109aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1110aa372e3fSPaul Mullowney 11119371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 11129371c9d4SSatish Balay tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 11139371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1114aa372e3fSPaul Mullowney 1115aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1116aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 11179566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 11189371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 11199371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 11209371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1121aa372e3fSPaul Mullowney 1122aa372e3fSPaul Mullowney /* assign the pointer */ 1123aa372e3fSPaul Mullowney matstructT->mat = hybMat; 11241a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1125aa372e3fSPaul Mullowney /* delete temporaries */ 1126aa372e3fSPaul Mullowney if (tempT) { 1127aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1128aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1129aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1130aa372e3fSPaul Mullowney delete (CsrMatrix *)tempT; 1131087f3262SPaul Mullowney } 1132aa372e3fSPaul Mullowney if (temp) { 1133aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY *)temp->values; 1134aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1135aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1136aa372e3fSPaul Mullowney delete (CsrMatrix *)temp; 1137aa372e3fSPaul Mullowney } 1138afb2bd1cSJunchao Zhang #endif 1139aa372e3fSPaul Mullowney } 1140a49f1ed0SStefano Zampini } 1141a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1142a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1143a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 114428b400f6SJacob Faibussowitsch PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 114528b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, 
PETSC_ERR_GPU, "Missing CsrMatrix rows"); 114628b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 114728b400f6SJacob Faibussowitsch PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 114828b400f6SJacob Faibussowitsch PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 114928b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 115028b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 115128b400f6SJacob Faibussowitsch PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1152a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1153a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1154a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 11559566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1156a49f1ed0SStefano Zampini } 1157a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1158a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1159792fecdfSBarry Smith PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1160a49f1ed0SStefano Zampini 1161a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1162a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1163a49f1ed0SStefano Zampini void *csr2cscBuffer; 1164a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 11659371c9d4SSatish Balay stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), 
cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 11669371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize); 11679371c9d4SSatish Balay PetscCallCUSPARSE(stat); 11689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1169a49f1ed0SStefano Zampini #endif 1170a49f1ed0SStefano Zampini 11711a2c6b5cSJunchao Zhang if (matrix->num_entries) { 11721a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 11731a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 11741a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 11751a2c6b5cSJunchao Zhang 11761a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 11771a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
11781a2c6b5cSJunchao Zhang */ 11799371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1180a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11819371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 11829371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1183a49f1ed0SStefano Zampini #else 11849371c9d4SSatish Balay matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 11859371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1186a49f1ed0SStefano Zampini #endif 11871a2c6b5cSJunchao Zhang } else { 11881a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 11891a2c6b5cSJunchao Zhang } 11901a2c6b5cSJunchao Zhang 1191a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1192792fecdfSBarry Smith PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1193a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11949566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1195a49f1ed0SStefano Zampini #endif 1196a49f1ed0SStefano Zampini } 11979371c9d4SSatish Balay PetscCallThrust( 11989371c9d4SSatish Balay thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1199a49f1ed0SStefano Zampini } 12009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 12019566063dSJacob 
Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1202213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1203213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1204aa372e3fSPaul Mullowney /* assign the pointer */ 1205aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 12061a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1207bda325fcSPaul Mullowney PetscFunctionReturn(0); 1208bda325fcSPaul Mullowney } 1209bda325fcSPaul Mullowney 1210a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1211*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1212*d71ae5a4SJacob Faibussowitsch { 1213c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1214465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1215465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1216465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1217465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1218bda325fcSPaul Mullowney cusparseStatus_t stat; 1219bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1220aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1221aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1222aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1223bda325fcSPaul Mullowney 1224bda325fcSPaul Mullowney PetscFunctionBegin; 1225aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1226aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 12279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1228aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1229aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1230bda325fcSPaul Mullowney } 1231bda325fcSPaul Mullowney 1232bda325fcSPaul Mullowney /* Get the GPU pointers */ 12339566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 12349566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1235c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1236c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1237bda325fcSPaul Mullowney 12389566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1239aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 12409371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1241aa372e3fSPaul Mullowney 1242aa372e3fSPaul Mullowney /* First, solve U */ 12439371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, 12441b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1245afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1246afb2bd1cSJunchao Zhang #endif 12479371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, 12481b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 12499371c9d4SSatish 
tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A^T x = b with the cached transposed triangular factors when the factorization
   used natural ordering: no row/column permutations are applied, so b and x are used directly. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cached lower/upper triangular factors, applying the row
   permutation to b on input and the column permutation to the result on output. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

/* Solve A x = b with the cached triangular factors when the factorization used
   natural ordering: no permutations, so b feeds L directly and U writes x directly. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}

#if CUSPARSE_VERSION >= 11500
/* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/* Forward solve with the ILU(0) factors stored in-place in fs: L y = b, then U x = y. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
                                       fs->spsvDescr_L)); // cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!

  /* Solve U*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Transpose solve with the ILU(0) factors: U^T y = b, then L^T x = y.
   Transpose SpSV descriptors/buffers are created lazily on first use and the
   (numeric) analysis is redone whenever the factor values have changed. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}

/* Numeric ILU(0): copy A's values into fact's in-place CSR storage, run csrilu02,
   then redo the (numeric) SpSV analysis for L and U. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}

/* Symbolic ILU(0): fact inherits A's nonzero pattern (no fill), so this routine copies
   A's i,j to the device, creates cusparse descriptors for M (in-place factor), L and U,
   sizes and allocates the csrilu02/SpSV work buffers, runs the structural analysis, and
   estimates the flops of the upcoming numeric factorization. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}

/* Forward solve with the ICC(0) factor: L y = b, then L^T x = y (definition continues). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 17739371c9d4SSatish Balay fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1774da112707SJunchao Zhang 1775da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1776da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1777da112707SJunchao Zhang 1778da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1779da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1780da112707SJunchao Zhang PetscFunctionReturn(0); 1781da112707SJunchao Zhang } 1782da112707SJunchao Zhang 1783*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) 1784*d71ae5a4SJacob Faibussowitsch { 1785da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1786da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1787da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1788da112707SJunchao Zhang CsrMatrix *Acsr; 1789da112707SJunchao Zhang PetscInt m, nz; 1790da112707SJunchao Zhang PetscBool flg; 1791da112707SJunchao Zhang 1792da112707SJunchao Zhang PetscFunctionBegin; 1793da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1794da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1795da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1796da112707SJunchao Zhang } 1797da112707SJunchao Zhang 1798da112707SJunchao Zhang /* Copy A's value to fact */ 1799da112707SJunchao Zhang m = fact->rmap->n; 1800da112707SJunchao Zhang nz = aij->nz; 1801da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1802da112707SJunchao Zhang Acsr = 
(CsrMatrix *)Acusp->mat->mat; 1803da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1804da112707SJunchao Zhang 1805da112707SJunchao Zhang /* Factorize fact inplace */ 1806da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1807da112707SJunchao Zhang Function csric02() only takes the lower triangular part of matrix A to perform factorization. 1808da112707SJunchao Zhang The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1809da112707SJunchao Zhang and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1810da112707SJunchao Zhang In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1811da112707SJunchao Zhang */ 18129371c9d4SSatish Balay if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1813da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1814da112707SJunchao Zhang int numerical_zero; 1815da112707SJunchao Zhang cusparseStatus_t status; 1816da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1817da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1818da112707SJunchao Zhang } 1819da112707SJunchao Zhang 18209371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1821da112707SJunchao Zhang 1822da112707SJunchao Zhang /* 
Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1823da112707SJunchao Zhang ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1824da112707SJunchao Zhang */ 18259371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1826da112707SJunchao Zhang 1827da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 1828da112707SJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1829da112707SJunchao Zhang fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1830da112707SJunchao Zhang fact->ops->matsolve = NULL; 1831da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 1832da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1833da112707SJunchao Zhang PetscFunctionReturn(0); 1834da112707SJunchao Zhang } 1835da112707SJunchao Zhang 1836*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1837*d71ae5a4SJacob Faibussowitsch { 1838da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1839da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1840da112707SJunchao Zhang PetscInt m, nz; 1841da112707SJunchao Zhang 1842da112707SJunchao Zhang PetscFunctionBegin; 1843da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1844da112707SJunchao Zhang PetscInt i; 1845da112707SJunchao Zhang PetscBool flg, missing; 1846da112707SJunchao Zhang 1847da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1848da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input 
is %s", ((PetscObject)A)->type_name); 1849da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1850da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A, &missing, &i)); 1851da112707SJunchao Zhang PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1852da112707SJunchao Zhang } 1853da112707SJunchao Zhang 1854da112707SJunchao Zhang /* Free the old stale stuff */ 1855da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1856da112707SJunchao Zhang 1857da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1858da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 1859da112707SJunchao Zhang */ 1860da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1861da112707SJunchao Zhang 1862da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 1863da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ICC; 1864da112707SJunchao Zhang fact->info.factor_mallocs = 0; 1865da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 1866da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 1867da112707SJunchao Zhang 1868da112707SJunchao Zhang aij->row = NULL; 1869da112707SJunchao Zhang aij->col = NULL; 1870da112707SJunchao Zhang 1871da112707SJunchao Zhang /* ====================================================================== */ 1872da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 1873da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 1874da112707SJunchao Zhang /* ====================================================================== */ 1875da112707SJunchao Zhang const int *Ai, *Aj; 1876da112707SJunchao Zhang 1877da112707SJunchao Zhang m = fact->rmap->n; 1878da112707SJunchao Zhang nz = aij->nz; 1879da112707SJunchao Zhang 1880da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1))); 1881da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz)); 1882da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 1883da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 1884da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1885da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1886da112707SJunchao Zhang 1887da112707SJunchao Zhang /* ====================================================================== */ 1888da112707SJunchao Zhang /* Create mat descriptors for M, L */ 1889da112707SJunchao Zhang /* ====================================================================== */ 1890da112707SJunchao Zhang cusparseFillMode_t fillMode; 1891da112707SJunchao Zhang cusparseDiagType_t diagType; 1892da112707SJunchao Zhang 1893da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1894da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1895da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1896da112707SJunchao Zhang 1897da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1898da112707SJunchao Zhang 
cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1899da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1900da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1901da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 1902da112707SJunchao Zhang */ 1903da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 1904da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 19059371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 19069371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 19079371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1908da112707SJunchao Zhang 1909da112707SJunchao Zhang /* ========================================================================= */ 1910da112707SJunchao Zhang /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 1911da112707SJunchao Zhang /* ========================================================================= */ 1912da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 19139371c9d4SSatish Balay if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M)); 1914da112707SJunchao Zhang 1915da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1916da112707SJunchao Zhang 
PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1917da112707SJunchao Zhang 1918da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1919da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1920da112707SJunchao Zhang 1921da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 19229371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1923da112707SJunchao Zhang 1924da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 19259371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1926da112707SJunchao Zhang 192712ba2bc6SJunchao Zhang /* To save device memory, we make the factorization buffer share with one of the solver buffer. 192812ba2bc6SJunchao Zhang See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 
192912ba2bc6SJunchao Zhang */ 193012ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 193112ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 193212ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 1933da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 193412ba2bc6SJunchao Zhang } else { 193512ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 193612ba2bc6SJunchao Zhang fs->spsvBuffer_Lt = fs->factBuffer_M; 193712ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 193812ba2bc6SJunchao Zhang } 1939da112707SJunchao Zhang 1940da112707SJunchao Zhang /* ========================================================================== */ 1941da112707SJunchao Zhang /* Perform analysis of ic0 on M */ 1942da112707SJunchao Zhang /* The lower triangular part of M has the same sparsity pattern as L */ 1943da112707SJunchao Zhang /* ========================================================================== */ 1944da112707SJunchao Zhang int structural_zero; 1945da112707SJunchao Zhang cusparseStatus_t status; 1946da112707SJunchao Zhang 1947da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 19489371c9d4SSatish Balay if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1949da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1950da112707SJunchao Zhang /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1951da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 1952da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 1953da112707SJunchao Zhang } 1954da112707SJunchao Zhang 1955da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 19560dd8c0acSJunchao Zhang { 1957da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 19580dd8c0acSJunchao Zhang PetscInt *Ai, nzRow, nzLeft; 1959da112707SJunchao Zhang PetscLogDouble flops = 0.0; 1960da112707SJunchao Zhang 1961da112707SJunchao Zhang Ai = Aseq->i; 1962da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 1963da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 1964da112707SJunchao Zhang if (nzRow > 1) { 1965da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1966da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 
1967da112707SJunchao Zhang */ 1968da112707SJunchao Zhang nzLeft = (nzRow - 1) / 2; 1969da112707SJunchao Zhang flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1970da112707SJunchao Zhang } 1971da112707SJunchao Zhang } 1972da112707SJunchao Zhang fs->numericFactFlops = flops; 19730dd8c0acSJunchao Zhang } 1974da112707SJunchao Zhang fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 1975da112707SJunchao Zhang PetscFunctionReturn(0); 1976da112707SJunchao Zhang } 1977da112707SJunchao Zhang #endif 1978da112707SJunchao Zhang 1979*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1980*d71ae5a4SJacob Faibussowitsch { 1981da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1982da112707SJunchao Zhang 1983da112707SJunchao Zhang PetscFunctionBegin; 1984da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 1985bc996fdcSJunchao Zhang PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 1986bc996fdcSJunchao Zhang if (cusparseTriFactors->factorizeOnDevice) { 1987da112707SJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 1988da112707SJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 1989bc996fdcSJunchao Zhang } 1990da112707SJunchao Zhang if (!info->levels && row_identity && col_identity) { 1991da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 1992da112707SJunchao Zhang } else 1993da112707SJunchao Zhang #endif 1994da112707SJunchao Zhang { 1995da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1996da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1997da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 1998da112707SJunchao Zhang } 1999da112707SJunchao Zhang PetscFunctionReturn(0); 2000da112707SJunchao Zhang } 

/* Symbolic full LU: always done on host via SeqAIJ; numeric phase dispatched to the CUSPARSE numeric routine. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Symbolic ICC dispatcher: uses the device ICC(0) fast path when cuSPARSE is new enough,
   device factorization was requested, no fill levels were asked for, and the ordering is
   identity; otherwise falls back to the host SeqAIJ symbolic factorization. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }

  PetscFunctionReturn(0);
}

/* Symbolic full Cholesky: always done on host via SeqAIJ; numeric phase dispatched to the CUSPARSE numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

/* Reports this factorization's solver package name (composed as "MatFactorGetSolverType_C"). */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Creates the factor matrix B for A, wires up the symbolic factorization function
   pointers for the requested factor type (LU/ILU/ILUDT or Cholesky/ICC), and processes
   -mat_factor_bind_factorization to decide host vs device factorization. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}

/* Copies the matrix values from device to host when the host copy is stale
   (offloadmask == GPU): from the CSR values for an unfactored matrix, or from the
   device factor values (fs->csrVal) for a device-factorized matrix where available. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Read-write access to the host value array: sync device->host first. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

/* Restore after read-write access: host copy is now authoritative. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}

/* Read-only access to the host value array: sync device->host, leave offloadmask unchanged. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

/* Restore after read-only access: nothing was modified, no mask change. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only access to the host value array: no device->host sync needed since the
   existing values will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(0);
}

/* Restore after write-only access: host copy is now authoritative. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(0);
}

/* Exposes the device CSR arrays (row offsets, column indices, values) of an unfactored
   matrix, reporting PETSC_MEMTYPE_CUDA; only supported with 32-bit PetscInt since the
   device arrays are plain int. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}

/* Mirror the host CSR data of A onto the device.

   Fast path: if the nonzero pattern is unchanged (same nonzerostate) and the
   storage format is CSR, only the values array is re-uploaded and the cached
   transpose is invalidated (values only). Otherwise the whole device structure
   is rebuilt: the old mult struct, work vector and row-offset cache are freed,
   compressed-row metadata is honored when in use, and a CsrMatrix (or, before
   CUDA 11, an ELL/HYB matrix) is allocated and filled from a->i/a->j/a->a.
   When a->a is NULL only the pattern is uploaded and the offload mask is left
   unchanged (both = PETSC_FALSE); otherwise the mask becomes
   PETSC_OFFLOAD_BOTH. Thrust exceptions are converted to PETSC_ERR_LIB. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Pattern changed (or non-CSR format): rebuild the device structures */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Pick the (possibly compressed) row structure to upload */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* Pattern-only upload: keep the offload mask as-is at the end */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants; cusparse runs in device-pointer mode */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Stage a temporary CSR copy on device, convert to HYB, then free it */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functors applied element-wise over zipped iterators. */

/* t<1> += t<0> */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* t<1> = t<0> */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* t<0> = t<1> (assignment in the reverse direction) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Per-product scratch data attached to C->product->data for sparse-dense and
   sparse-sparse products (fields continue in the next chunk). */
struct MatMatCusparse {
  PetscBool cisdense;
/* (continued) remaining fields of struct MatMatCusparse */
  PetscScalar *Bt; /* buffer for B^T when the old csrmm API cannot transpose B */
  Mat          X;  /* intermediate dense product for PtAP/RARt */
  PetscBool    reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t mmBufferSize;
  void  *mmBuffer;
  void  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Destructor for MatMatCusparse product data: releases every conditionally
   allocated device buffer/descriptor, the intermediate matrix X, and the
   struct itself. Installed as C->product->destroy. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);

/* Numeric phase of sparse(A) x dense(B) products on the GPU
   (body continues in the next chunk). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
/* (continued) body of MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.
   Computes C (dense) from A (SeqAIJCUSPARSE) and B (dense) for AB, AtB, ABt,
   PtAP and RARt; PtAP/RARt go through the intermediate X = A*B^(t), followed
   by a dense-dense multiply. Uses cusparseSpMM on CUDA >= 11 (descriptors and
   workspace cached in mmdata), or csrmm with an explicit cublasXgeam
   transpose of B on older toolkits. */
  CsrMatrix *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* Select op(A) and the result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* Build/reuse the explicit transpose so A^T can be applied untransposed */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* result of the SpMM goes into the intermediate X, not directly into C */
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* grow-only workspace: reallocate only when the required size increases */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* undo any temporary GPU conversions requested above */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}

/* Symbolic phase of sparse(A) x dense(B) products: sizes C, allocates the
   MatMatCusparse scratch data, and installs the numeric routine
   (body continues in the next chunk). */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;
/* (continued) body of MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA */
  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions per product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}

/* Numeric phase of sparse(A) x sparse(B) products (SpGEMM); the body
   continues past this chunk. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize; /* values already on the GPU; only the assembly bookkeeping below remains */
  }
  if (!c->nz) goto finalize; /* empty result: skip the SpGEMM entirely */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* make sure the device copies of A and B are up to date before computing on the GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  ptype = product->type;
  /* the symbolic phase may have replaced AtB/ABt by AB when A/B is symmetric; mirror that mapping here */
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicitly stored transpose structs, since the
     cuSPARSE SpGEMM path is always called with non-transpose operations (see opA/opB above) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: recompute values only, reusing the SpGEMM descriptor built in the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  /* CUDA 11.0-11.3: recompute then copy the result into the CSR arrays of C */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  /* CUDA < 11: legacy csrgemm-style API */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs         = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if
PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2815fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2816fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2817fcdce8c4SStefano Zampini #else 2818fcdce8c4SStefano Zampini int cnz; 2819fcdce8c4SStefano Zampini #endif 2820b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2821fcdce8c4SStefano Zampini 2822fcdce8c4SStefano Zampini PetscFunctionBegin; 2823fcdce8c4SStefano Zampini MatCheckProduct(C, 1); 282428b400f6SJacob Faibussowitsch PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2825fcdce8c4SStefano Zampini A = product->A; 2826fcdce8c4SStefano Zampini B = product->B; 28279566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 282828b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 28299566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 283028b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 2831fcdce8c4SStefano Zampini a = (Mat_SeqAIJ *)A->data; 2832fcdce8c4SStefano Zampini b = (Mat_SeqAIJ *)B->data; 2833fcdce8c4SStefano Zampini /* product data */ 28349566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2835fcdce8c4SStefano Zampini C->product->data = mmdata; 2836fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2837fcdce8c4SStefano Zampini 28389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 28399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2840d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not 
before */ 2841d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 284208401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 284308401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2844d60bce21SJunchao Zhang 2845fcdce8c4SStefano Zampini ptype = product->type; 2846b94d7dedSBarry Smith if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2847fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2848fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2849fa046f9fSJunchao Zhang } 2850b94d7dedSBarry Smith if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2851fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2852fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2853fa046f9fSJunchao Zhang } 2854fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2855fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2856fcdce8c4SStefano Zampini switch (ptype) { 2857fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2858fcdce8c4SStefano Zampini m = A->rmap->n; 2859fcdce8c4SStefano Zampini n = B->cmap->n; 2860fcdce8c4SStefano Zampini k = A->cmap->n; 2861fcdce8c4SStefano Zampini Amat = Acusp->mat; 2862fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2863fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2864fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2865fcdce8c4SStefano Zampini break; 2866fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2867fcdce8c4SStefano Zampini m = A->cmap->n; 2868fcdce8c4SStefano Zampini n = B->cmap->n; 2869fcdce8c4SStefano Zampini k = A->rmap->n; 28709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2871fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2872fcdce8c4SStefano Zampini Bmat 
= Bcusp->mat; 2873fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2874fcdce8c4SStefano Zampini break; 2875fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2876fcdce8c4SStefano Zampini m = A->rmap->n; 2877fcdce8c4SStefano Zampini n = B->rmap->n; 2878fcdce8c4SStefano Zampini k = A->cmap->n; 28799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2880fcdce8c4SStefano Zampini Amat = Acusp->mat; 2881fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2882fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2883fcdce8c4SStefano Zampini break; 2884*d71ae5a4SJacob Faibussowitsch default: 2885*d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2886fcdce8c4SStefano Zampini } 2887fcdce8c4SStefano Zampini 2888fcdce8c4SStefano Zampini /* create cusparse matrix */ 28899566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, m, n, m, n)); 28909566063dSJacob Faibussowitsch PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 2891fcdce8c4SStefano Zampini c = (Mat_SeqAIJ *)C->data; 2892fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2893fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2894fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2895fcdce8c4SStefano Zampini 2896fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2897fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2898fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 28999566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 29009566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 2901fcdce8c4SStefano Zampini Ccusp->workVector = new 
THRUSTARRAY(c->compressedrow.nrows); 2902fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2903fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 2904fcdce8c4SStefano Zampini } else { 2905fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2906fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2907fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2908fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2909fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2910fcdce8c4SStefano Zampini } 2911fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2912fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2913fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2914fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2915fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2916fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 29179566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 29189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 29199566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 29209566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 29219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 29229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 29239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 29249566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 29259566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2926fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2927fcdce8c4SStefano Zampini thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0); 2928fcdce8c4SStefano Zampini c->nz = 0; 2929fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2930fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2931fcdce8c4SStefano Zampini goto finalizesym; 2932fcdce8c4SStefano Zampini } 2933fcdce8c4SStefano Zampini 293428b400f6SJacob Faibussowitsch PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 293528b400f6SJacob Faibussowitsch PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2936fcdce8c4SStefano Zampini Acsr = (CsrMatrix *)Amat->mat; 2937fcdce8c4SStefano Zampini if (!biscompressed) { 2938fcdce8c4SStefano Zampini Bcsr = (CsrMatrix *)Bmat->mat; 2939fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2940fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2941fcdce8c4SStefano Zampini #endif 2942fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2943fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 2944fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2945fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2946fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2947fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2948fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2949fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2950fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2951fcdce8c4SStefano Zampini 
Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2952fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 29539566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 2954fcdce8c4SStefano Zampini } 2955fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2956fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2957fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2958fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 29599371c9d4SSatish Balay stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 29609371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2961fcdce8c4SStefano Zampini } 2962fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2963fcdce8c4SStefano Zampini #endif 2964fcdce8c4SStefano Zampini } 296528b400f6SJacob Faibussowitsch PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 296628b400f6SJacob Faibussowitsch PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2967fcdce8c4SStefano Zampini /* precompute flops count */ 2968fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2969fcdce8c4SStefano Zampini for (i = 0, flops = 0; i < A->rmap->n; i++) { 2970fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2971fcdce8c4SStefano Zampini const PetscInt en = a->i[i + 1]; 2972fcdce8c4SStefano Zampini for (j = st; j < en; j++) { 2973fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2974fcdce8c4SStefano Zampini flops += 2. 
* (b->i[brow + 1] - b->i[brow]); 2975fcdce8c4SStefano Zampini } 2976fcdce8c4SStefano Zampini } 2977fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2978fcdce8c4SStefano Zampini for (i = 0, flops = 0; i < A->rmap->n; i++) { 2979fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i + 1] - a->i[i]; 2980fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i + 1] - b->i[i]; 2981fcdce8c4SStefano Zampini flops += (2. * anzi) * bnzi; 2982fcdce8c4SStefano Zampini } 2983fcdce8c4SStefano Zampini } else { /* TODO */ 2984fcdce8c4SStefano Zampini flops = 0.; 2985fcdce8c4SStefano Zampini } 2986fcdce8c4SStefano Zampini 2987fcdce8c4SStefano Zampini mmdata->flops = flops; 29889566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2989b4285af6SJunchao Zhang 2990fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 29919566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 29929371c9d4SSatish Balay stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 29939371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29949566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2995b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2996b4285af6SJunchao Zhang { 2997b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
2998b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2999b4285af6SJunchao Zhang */ 3000b4285af6SJunchao Zhang void *dBuffer1 = NULL; 3001b4285af6SJunchao Zhang void *dBuffer2 = NULL; 3002b4285af6SJunchao Zhang void *dBuffer3 = NULL; 3003b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3004b4285af6SJunchao Zhang size_t bufferSize1 = 0; 3005b4285af6SJunchao Zhang size_t bufferSize2 = 0; 3006b4285af6SJunchao Zhang size_t bufferSize3 = 0; 3007b4285af6SJunchao Zhang size_t bufferSize4 = 0; 3008b4285af6SJunchao Zhang size_t bufferSize5 = 0; 3009b4285af6SJunchao Zhang 3010b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 3011b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 30129371c9d4SSatish Balay stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 30139371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3015b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 30169371c9d4SSatish Balay stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 30179371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3018b4285af6SJunchao Zhang 3019b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 30209371c9d4SSatish Balay stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 
30219371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 30239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 30249566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 30259371c9d4SSatish Balay stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 30269371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30279566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer1)); 30289566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer2)); 3029b4285af6SJunchao Zhang 3030b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 3031b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 30329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3033b4285af6SJunchao Zhang c->nz = (PetscInt)C_nnz1; 3034b4285af6SJunchao Zhang /* allocate matrix C */ 30359371c9d4SSatish Balay Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 30369371c9d4SSatish Balay PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 30379371c9d4SSatish Balay Ccsr->values = new THRUSTARRAY(c->nz); 30389371c9d4SSatish Balay PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3039b4285af6SJunchao Zhang /* update matC with the new pointers */ 30409371c9d4SSatish Balay stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 30419371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3042b4285af6SJunchao Zhang 3043b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 30449371c9d4SSatish Balay stat 
= cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 30459371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 30479371c9d4SSatish Balay stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 30489371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30499566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer3)); 30509371c9d4SSatish Balay stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 30519371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30529566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3053b4285af6SJunchao Zhang } 3054ae37ee31SJunchao Zhang #else 3055b4285af6SJunchao Zhang size_t bufSize2; 3056fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 30579371c9d4SSatish Balay stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 30589371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30599566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3060fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 30619371c9d4SSatish Balay stat = 
cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 30629371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3063fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 30649371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 30659371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3066fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 3067fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 3068fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3069fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3070fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 30719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3072fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 30739371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 30749371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3075fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 30769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3077fcdce8c4SStefano Zampini c->nz = (PetscInt)C_nnz1; 30789371c9d4SSatish Balay PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 30799371c9d4SSatish Balay mmdata->mmBufferSize / 1024)); 3080fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 30819566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3082fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 30839566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 30849371c9d4SSatish Balay stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 30859371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30869371c9d4SSatish Balay stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 30879371c9d4SSatish Balay PetscCallCUSPARSE(stat); 
3088ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3089fcdce8c4SStefano Zampini #else 30909566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 30919371c9d4SSatish Balay stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 30929371c9d4SSatish Balay Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 30939371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3094fcdce8c4SStefano Zampini c->nz = cnz; 3095fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 30969566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3097fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 30989566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3099fcdce8c4SStefano Zampini 31009566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3101fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3102fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3103fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 31049371c9d4SSatish Balay stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 31059371c9d4SSatish Balay Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 31069371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3107fcdce8c4SStefano Zampini #endif 31089566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 31099566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3110fcdce8c4SStefano Zampini finalizesym: 3111fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 3112fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 3113fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 31149566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &c->i)); 31159566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->j)); 3116fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3117fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3118fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3119fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3120fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 3121fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 3122fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 31239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 31249566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3125fcdce8c4SStefano Zampini } else { 
3126fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3127fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 31289566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 31299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3130fcdce8c4SStefano Zampini } 3131fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 3132fcdce8c4SStefano Zampini PetscInt r = 0; 3133fcdce8c4SStefano Zampini c->i[0] = 0; 3134fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 3135fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 3136fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 3137fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r + 1] = old; 3138fcdce8c4SStefano Zampini } 3139fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3140fcdce8c4SStefano Zampini } 31419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 31429566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 31439566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->imax)); 3144fcdce8c4SStefano Zampini c->maxnz = c->nz; 3145fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 3146fcdce8c4SStefano Zampini c->rmax = 0; 3147fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 3148fcdce8c4SStefano Zampini const PetscInt nn = c->i[k + 1] - c->i[k]; 3149fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 3150fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt) !!nn; 3151fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 3152fcdce8c4SStefano Zampini } 31539566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(C)); 
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
/*
  Selects the product-symbolic implementation for a MATSEQAIJCUSPARSE matrix product.

  Routing logic:
  - If B is sequential dense, use the CUSPARSE sparse-times-dense path (unless A is bound to
    the CPU, in which case fall back to the SeqAIJ/SeqDense CPU selection).
  - If B (and, for ABC products, C) are MATSEQAIJCUSPARSE and not bound to the CPU, use the
    GPU sparse-sparse paths; the user may still force the CPU backend via the
    "-*_backend_cpu" options polled below.
  - Otherwise fall back to the plain SeqAIJ selection.

  Note: the option name polled depends on product->api_user, i.e. whether the product was
  created through the old-style API (MatMatMult() etc.) or through the MatProduct API.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* only consider the GPU backend when neither operand is pinned to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    /* user asked for the CPU backend: pretend B/C are not CUSPARSE so the dispatch below falls through */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* PtAP/RARt/ABC are composed from pairwise products by the basic implementation */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(0);
}

/* y = A x; thin wrapper over the shared mult/add kernel (no add, no transpose) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
/* z = A x + y; wrapper over the shared mult/add kernel (add, no transpose) */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y = A^H x; wrapper over the shared mult/add kernel (no add, Hermitian transpose) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* z = A^H x + y; wrapper over the shared mult/add kernel (add, Hermitian transpose) */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}

/* y = A^T x; wrapper over the shared mult/add kernel (no add, transpose) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* y[idx[i]] += x[i] for i in [0,n): scatter-add the compressed-row work vector back into the
   full-length result. Launched 1-D with ceil(n/256) blocks of 256 threads (see the caller in
   MatMultAddKernel_SeqAIJCUSPARSE); idx entries must be valid indices into y.
   The flattened index is computed in PetscInt, not int: with 64-bit PetscInt, n (and hence the
   grid) may exceed 2^31-1, and blockIdx.x * blockDim.x would overflow 32-bit arithmetic. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  /* herm implies trans: a Hermitian no-op is not a supported combination */
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: result is just y (or zero) without touching cuSPARSE */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
    else PetscCall(VecCopy_SeqCUDA(yy, zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* Either apply the cuSPARSE transpose operation to the stored matrix, or use an
       explicitly formed transpose (built lazily on first use) with the non-transpose op */
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  /* thrust may throw; the catch below converts that into a PETSc error */
  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up to date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with fewer rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA indexes the cuSpMV[] cache below, so it must stay within the legacy 0..2 range */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {           /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy, zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* transpose path: A^T x is full length, so only an axpy with yy may remain */
      if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* flop accounting: 2 flops per nonzero; without the add, one flop per nonzero row is saved */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}

/* z = A^T x + y; wrapper over the shared mult/add kernel (add, transpose) */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}

/* Completes CPU-side assembly, then drops the cached device matrix if the nonzero pattern changed
   (it would be stale; it is rebuilt on demand) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscObjectState    onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  
A - the matrix

   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format, also called
   compressed row storage, is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}

/* Release GPU-side storage (plain or triangular-factor form), clear all composed
   query functions, then fall through to the host MatDestroy_SeqAIJ() */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);

/* Duplicate via the host path, then convert the copy in place to MATSEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}

/* Y = a*X + Y on the GPU: cuBLAS axpy when the nonzero patterns match, cuSPARSE
   spgeam for SUBSET_NONZERO_PATTERN, host fallback otherwise */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
3619a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3620039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3621039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3622039c6fbaSStefano Zampini PetscScalar *ay; 3623039c6fbaSStefano Zampini const PetscScalar *ax; 3624039c6fbaSStefano Zampini CsrMatrix *csry, *csrx; 3625e6e9a74fSStefano Zampini 362695639643SRichard Tran Mills PetscFunctionBegin; 3627a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3628a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3629039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 36309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 36319566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3632a587d139SMark PetscFunctionReturn(0); 363395639643SRichard Tran Mills } 3634039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 36359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 36369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 36375f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 36385f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3639039c6fbaSStefano Zampini csry = (CsrMatrix *)cy->mat->mat; 3640039c6fbaSStefano Zampini csrx = (CsrMatrix *)cx->mat->mat; 3641039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3642039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3643039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3644ad540459SPierre Jolivet if (eq) eq = thrust::equal(thrust::device, 
csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3645039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3646039c6fbaSStefano Zampini } 3647d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3648d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3649039c6fbaSStefano Zampini 3650039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3651039c6fbaSStefano Zampini PetscScalar b = 1.0; 3652039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3653039c6fbaSStefano Zampini size_t bufferSize; 3654039c6fbaSStefano Zampini void *buffer; 3655039c6fbaSStefano Zampini #endif 3656039c6fbaSStefano Zampini 36579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 36589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 36599566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3660039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 36619371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 36629371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 36639566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 36649566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 36659371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 36669371c9d4SSatish Balay 
csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 36679566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 36689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 36699566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 3670039c6fbaSStefano Zampini #else 36719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 36729371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 36739371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 36749566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 36759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3676039c6fbaSStefano Zampini #endif 36779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 36789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 36799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 36809566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3681039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3682a587d139SMark cublasHandle_t cublasv2handle; 3683a587d139SMark PetscBLASInt one = 1, bnz = 1; 3684039c6fbaSStefano Zampini 36859566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 36869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 36879566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 36889566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz, &bnz)); 36899566063dSJacob Faibussowitsch 
PetscCall(PetscLogGpuTimeBegin()); 36909566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 36919566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * bnz)); 36929566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 36939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 36949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 36959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3696039c6fbaSStefano Zampini } else { 36979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 36989566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3699a587d139SMark } 370095639643SRichard Tran Mills PetscFunctionReturn(0); 370195639643SRichard Tran Mills } 370295639643SRichard Tran Mills 3703*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3704*d71ae5a4SJacob Faibussowitsch { 370533c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 370633c9ba73SStefano Zampini PetscScalar *ay; 370733c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 370833c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 370933c9ba73SStefano Zampini 371033c9ba73SStefano Zampini PetscFunctionBegin; 37119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 37129566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 37139566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz, &bnz)); 37149566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 37159566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 37169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(bnz)); 37179566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 37189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 37199566063dSJacob Faibussowitsch 
PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 372033c9ba73SStefano Zampini PetscFunctionReturn(0); 372133c9ba73SStefano Zampini } 372233c9ba73SStefano Zampini 3723*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3724*d71ae5a4SJacob Faibussowitsch { 37257e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3726a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 37277e8381f9SStefano Zampini 37283fa6b06aSMark Adams PetscFunctionBegin; 37293fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 37303fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 37317e8381f9SStefano Zampini if (spptr->mat) { 37327e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 37337e8381f9SStefano Zampini if (matrix->values) { 37347e8381f9SStefano Zampini both = PETSC_TRUE; 37357e8381f9SStefano Zampini thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 37367e8381f9SStefano Zampini } 37377e8381f9SStefano Zampini } 37387e8381f9SStefano Zampini if (spptr->matTranspose) { 37397e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3740ad540459SPierre Jolivet if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 37417e8381f9SStefano Zampini } 37423fa6b06aSMark Adams } 37439566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 37449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 37457e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3746a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 37473fa6b06aSMark Adams PetscFunctionReturn(0); 37483fa6b06aSMark Adams } 37493fa6b06aSMark Adams 3750*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 3751*d71ae5a4SJacob Faibussowitsch { 3752a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3753a587d139SMark 3754a587d139SMark 
PetscFunctionBegin; 37559a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 37569a14fc28SStefano Zampini A->boundtocpu = flg; 37579a14fc28SStefano Zampini PetscFunctionReturn(0); 37589a14fc28SStefano Zampini } 3759a587d139SMark if (flg) { 37609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3761a587d139SMark 376233c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3763a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3764a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3765a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3766a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3767a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3768a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3769a587d139SMark A->ops->multhermitiantranspose = NULL; 3770a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3771fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 37729566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 37739566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 37749566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 37759566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 37769566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 37779566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 37789566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3779a587d139SMark } else { 378033c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3781a587d139SMark 
A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3782a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3783a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3784a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3785a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3786a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3787a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3788a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3789fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 379067a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 379167a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 379267a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 379367a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 379467a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 379567a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 37967ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 37977ee59b9bSJunchao Zhang 37989566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 37999566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 38009566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 38019566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", 
MatSetPreallocationCOO_SeqAIJCUSPARSE)); 38029566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 38039566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 3804a587d139SMark } 3805a587d139SMark A->boundtocpu = flg; 3806ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 3807ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 3808ea500dcfSRichard Tran Mills } else { 3809ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 3810ea500dcfSRichard Tran Mills } 3811a587d139SMark PetscFunctionReturn(0); 3812a587d139SMark } 3813a587d139SMark 3814*d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 3815*d71ae5a4SJacob Faibussowitsch { 381649735bf3SStefano Zampini Mat B; 38179ae82921SPaul Mullowney 38189ae82921SPaul Mullowney PetscFunctionBegin; 38199566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 382049735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 38219566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 382249735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 38239566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 382449735bf3SStefano Zampini } 382549735bf3SStefano Zampini B = *newmat; 382649735bf3SStefano Zampini 38279566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 38289566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 382934136279SStefano Zampini 383049735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 38319ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3832e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE 
*spptr; 38339566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 38349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 38359566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 38361a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3837d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3838ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301 3839a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3840a435da06SStefano Zampini #else 3841d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3842a435da06SStefano Zampini #endif 3843d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3844d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3845d8132acaSStefano Zampini #endif 38461a2c6b5cSJunchao Zhang B->spptr = spptr; 38479ae82921SPaul Mullowney } else { 3848e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3849e6e9a74fSStefano Zampini 38509566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 38519566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 38529566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 3853e6e9a74fSStefano Zampini B->spptr = spptr; 38549ae82921SPaul Mullowney } 3855e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 385649735bf3SStefano Zampini } 3857693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 38589ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 38591a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 38609ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 386195639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 
3862693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 38632205254eSKarl Rupp 38649566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 38659566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 38669566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 3867ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 38689566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 3869ae48a8d0SStefano Zampini #endif 38709566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 38719ae82921SPaul Mullowney PetscFunctionReturn(0); 38729ae82921SPaul Mullowney } 38739ae82921SPaul Mullowney 3874*d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 3875*d71ae5a4SJacob Faibussowitsch { 387602fe1965SBarry Smith PetscFunctionBegin; 38779566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(B)); 38789566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 387902fe1965SBarry Smith PetscFunctionReturn(0); 388002fe1965SBarry Smith } 388102fe1965SBarry Smith 38823ca39a21SBarry Smith /*MC 3883e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3884e057df02SPaul Mullowney 388511a5261eSBarry Smith A matrix type type whose data resides on NVIDIA GPUs. These matrices can be in either 388611a5261eSBarry Smith CSR, ELL, or Hybrid format. 388711a5261eSBarry Smith All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 
3888e057df02SPaul Mullowney 3889e057df02SPaul Mullowney Options Database Keys: 389011a5261eSBarry Smith + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 389111a5261eSBarry Smith . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 389211a5261eSBarry Smith - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 389311a5261eSBarry Smith + -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU 3894e057df02SPaul Mullowney 3895e057df02SPaul Mullowney Level: beginner 3896e057df02SPaul Mullowney 389711a5261eSBarry Smith .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 3898e057df02SPaul Mullowney M*/ 38997f756511SDominic Meiser 3900bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *); 39010f39cd5aSBarry Smith 3902*d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 3903*d71ae5a4SJacob Faibussowitsch { 390442c9c57cSBarry Smith PetscFunctionBegin; 39059566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band)); 39069566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 39079566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 39089566063dSJacob Faibussowitsch 
PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 39099566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 3910bddcd29dSMark Adams 391142c9c57cSBarry Smith PetscFunctionReturn(0); 391242c9c57cSBarry Smith } 391329b38603SBarry Smith 3914*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 3915*d71ae5a4SJacob Faibussowitsch { 3916cbc6b225SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr; 3917cbc6b225SStefano Zampini 3918cbc6b225SStefano Zampini PetscFunctionBegin; 3919cbc6b225SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3920cbc6b225SStefano Zampini delete cusp->cooPerm; 3921cbc6b225SStefano Zampini delete cusp->cooPerm_a; 3922cbc6b225SStefano Zampini cusp->cooPerm = NULL; 3923cbc6b225SStefano Zampini cusp->cooPerm_a = NULL; 3924cbc6b225SStefano Zampini if (cusp->use_extended_coo) { 39259566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->jmap_d)); 39269566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->perm_d)); 3927cbc6b225SStefano Zampini } 3928cbc6b225SStefano Zampini cusp->use_extended_coo = PETSC_FALSE; 3929cbc6b225SStefano Zampini PetscFunctionReturn(0); 3930cbc6b225SStefano Zampini } 3931cbc6b225SStefano Zampini 3932*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 3933*d71ae5a4SJacob Faibussowitsch { 39347f756511SDominic Meiser PetscFunctionBegin; 39357f756511SDominic Meiser if (*cusparsestruct) { 39369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format)); 39379566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format)); 39387f756511SDominic Meiser delete 
(*cusparsestruct)->workVector; 393981902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 39407e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 39417e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3942a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 39439566063dSJacob Faibussowitsch if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 39449566063dSJacob Faibussowitsch if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 39459566063dSJacob Faibussowitsch if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 39469566063dSJacob Faibussowitsch PetscCall(PetscFree(*cusparsestruct)); 39477f756511SDominic Meiser } 39487f756511SDominic Meiser PetscFunctionReturn(0); 39497f756511SDominic Meiser } 39507f756511SDominic Meiser 3951*d71ae5a4SJacob Faibussowitsch static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 3952*d71ae5a4SJacob Faibussowitsch { 39537f756511SDominic Meiser PetscFunctionBegin; 39547f756511SDominic Meiser if (*mat) { 39557f756511SDominic Meiser delete (*mat)->values; 39567f756511SDominic Meiser delete (*mat)->column_indices; 39577f756511SDominic Meiser delete (*mat)->row_offsets; 39587f756511SDominic Meiser delete *mat; 39597f756511SDominic Meiser *mat = 0; 39607f756511SDominic Meiser } 39617f756511SDominic Meiser PetscFunctionReturn(0); 39627f756511SDominic Meiser } 39637f756511SDominic Meiser 3964*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 3965*d71ae5a4SJacob Faibussowitsch { 39667f756511SDominic Meiser PetscFunctionBegin; 39677f756511SDominic Meiser if (*trifactor) { 39689566063dSJacob Faibussowitsch if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3969261a78b4SJunchao Zhang if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 
39709566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 39719566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 39729566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3973afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 39749566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3975afb2bd1cSJunchao Zhang #endif 39769566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactor)); 39777f756511SDominic Meiser } 39787f756511SDominic Meiser PetscFunctionReturn(0); 39797f756511SDominic Meiser } 39807f756511SDominic Meiser 3981*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) 3982*d71ae5a4SJacob Faibussowitsch { 39837f756511SDominic Meiser CsrMatrix *mat; 39847f756511SDominic Meiser 39857f756511SDominic Meiser PetscFunctionBegin; 39867f756511SDominic Meiser if (*matstruct) { 39877f756511SDominic Meiser if ((*matstruct)->mat) { 39887f756511SDominic Meiser if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 3989afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3990afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3991afb2bd1cSJunchao Zhang #else 39927f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 39939566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3994afb2bd1cSJunchao Zhang #endif 39957f756511SDominic Meiser } else { 39967f756511SDominic Meiser mat = (CsrMatrix *)(*matstruct)->mat; 39977f756511SDominic Meiser CsrMatrix_Destroy(&mat); 39987f756511SDominic Meiser } 39997f756511SDominic Meiser } 40009566063dSJacob Faibussowitsch if ((*matstruct)->descr) 
PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 40017f756511SDominic Meiser delete (*matstruct)->cprowIndices; 40029566063dSJacob Faibussowitsch if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 40039566063dSJacob Faibussowitsch if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 40049566063dSJacob Faibussowitsch if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4005afb2bd1cSJunchao Zhang 4006afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4007afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 40089566063dSJacob Faibussowitsch if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4009afb2bd1cSJunchao Zhang for (int i = 0; i < 3; i++) { 4010afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 40119566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 40129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 40139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4014afb2bd1cSJunchao Zhang } 4015afb2bd1cSJunchao Zhang } 4016afb2bd1cSJunchao Zhang #endif 40177f756511SDominic Meiser delete *matstruct; 40187e8381f9SStefano Zampini *matstruct = NULL; 40197f756511SDominic Meiser } 40207f756511SDominic Meiser PetscFunctionReturn(0); 40217f756511SDominic Meiser } 40227f756511SDominic Meiser 4023*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) 4024*d71ae5a4SJacob Faibussowitsch { 4025da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4026da112707SJunchao Zhang 40277f756511SDominic Meiser PetscFunctionBegin; 4028da112707SJunchao Zhang if (fs) { 4029da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4030da112707SJunchao Zhang 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4031da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4032da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4033da112707SJunchao Zhang delete fs->rpermIndices; 4034da112707SJunchao Zhang delete fs->cpermIndices; 4035da112707SJunchao Zhang delete fs->workVector; 4036da112707SJunchao Zhang fs->rpermIndices = NULL; 4037da112707SJunchao Zhang fs->cpermIndices = NULL; 4038da112707SJunchao Zhang fs->workVector = NULL; 4039da112707SJunchao Zhang if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d)); 4040da112707SJunchao Zhang if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d)); 4041da112707SJunchao Zhang fs->init_dev_prop = PETSC_FALSE; 4042da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 4043da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4044da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx)); 4045da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrVal)); 4046da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->X)); 4047da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->Y)); 404812ba2bc6SJunchao Zhang // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 4049da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4050da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 405112ba2bc6SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4052da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4053da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4054da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4055da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4056da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4057da112707SJunchao Zhang 
PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4058da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4059da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4060da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4061da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4062da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4063da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 406412ba2bc6SJunchao Zhang 406512ba2bc6SJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_FALSE; 406612ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4067da112707SJunchao Zhang #endif 4068ccdfe979SStefano Zampini } 4069ccdfe979SStefano Zampini PetscFunctionReturn(0); 4070ccdfe979SStefano Zampini } 4071ccdfe979SStefano Zampini 4072*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) 4073*d71ae5a4SJacob Faibussowitsch { 4074ccdfe979SStefano Zampini cusparseHandle_t handle; 4075ccdfe979SStefano Zampini 4076ccdfe979SStefano Zampini PetscFunctionBegin; 4077ccdfe979SStefano Zampini if (*trifactors) { 40789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 407948a46eb9SPierre Jolivet if (handle = (*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy(handle)); 40809566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 40817f756511SDominic Meiser } 40827f756511SDominic Meiser PetscFunctionReturn(0); 40837f756511SDominic Meiser } 40847e8381f9SStefano Zampini 40859371c9d4SSatish Balay struct IJCompare { 4086*d71ae5a4SJacob Faibussowitsch __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4087*d71ae5a4SJacob Faibussowitsch { 40887e8381f9SStefano Zampini if 
(t1.get<0>() < t2.get<0>()) return true; 40897e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 40907e8381f9SStefano Zampini return false; 40917e8381f9SStefano Zampini } 40927e8381f9SStefano Zampini }; 40937e8381f9SStefano Zampini 40949371c9d4SSatish Balay struct IJEqual { 4095*d71ae5a4SJacob Faibussowitsch __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4096*d71ae5a4SJacob Faibussowitsch { 40977e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 40987e8381f9SStefano Zampini return true; 40997e8381f9SStefano Zampini } 41007e8381f9SStefano Zampini }; 41017e8381f9SStefano Zampini 41029371c9d4SSatish Balay struct IJDiff { 41039371c9d4SSatish Balay __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; } 41047e8381f9SStefano Zampini }; 41057e8381f9SStefano Zampini 41069371c9d4SSatish Balay struct IJSum { 41079371c9d4SSatish Balay __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; } 41087e8381f9SStefano Zampini }; 41097e8381f9SStefano Zampini 41107e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 4111219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 4112*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 4113*d71ae5a4SJacob Faibussowitsch { 41147e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4115fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4116bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 411708391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 41187e8381f9SStefano Zampini CsrMatrix *matrix; 41197e8381f9SStefano Zampini PetscInt n; 41207e8381f9SStefano Zampini 
41217e8381f9SStefano Zampini PetscFunctionBegin; 412228b400f6SJacob Faibussowitsch PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct"); 412328b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix"); 41247e8381f9SStefano Zampini if (!cusp->cooPerm) { 41259566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 41269566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 41277e8381f9SStefano Zampini PetscFunctionReturn(0); 41287e8381f9SStefano Zampini } 41297e8381f9SStefano Zampini matrix = (CsrMatrix *)cusp->mat->mat; 413028b400f6SJacob Faibussowitsch PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4131e61fc153SStefano Zampini if (!v) { 4132e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 4133e61fc153SStefano Zampini goto finalize; 41347e8381f9SStefano Zampini } 4135e61fc153SStefano Zampini n = cusp->cooPerm->size(); 413608391a17SStefano Zampini if (isCudaMem(v)) { 413708391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 413808391a17SStefano Zampini } else { 4139e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 4140e61fc153SStefano Zampini cooPerm_v->assign(v, v + n); 414108391a17SStefano Zampini d_v = cooPerm_v->data(); 41429566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 414308391a17SStefano Zampini } 41449566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4145e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 4146ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 4147bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 414808391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v, 
cusp->cooPerm->begin()); 4149ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 4150ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 4151ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 4152ddea5d60SJunchao Zhang */ 4153e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>()); 4154e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>()); 4155e61fc153SStefano Zampini delete cooPerm_w; 41567e8381f9SStefano Zampini } else { 4157ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 41589371c9d4SSatish Balay auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin())); 41599371c9d4SSatish Balay auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end())); 4160ddea5d60SJunchao Zhang thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 41617e8381f9SStefano Zampini } 41627e8381f9SStefano Zampini } else { 4163e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 416408391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()); 4165e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>()); 41667e8381f9SStefano Zampini } else { 41679371c9d4SSatish Balay auto zibit = 
thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin())); 41689371c9d4SSatish Balay auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end())); 41697e8381f9SStefano Zampini thrust::for_each(zibit, zieit, VecCUDAEquals()); 41707e8381f9SStefano Zampini } 41717e8381f9SStefano Zampini } 41729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4173e61fc153SStefano Zampini finalize: 4174e61fc153SStefano Zampini delete cooPerm_v; 41757e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 41769566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4177fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 41789566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz)); 41799566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n")); 41809566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax)); 4181fcdce8c4SStefano Zampini a->reallocs = 0; 4182fcdce8c4SStefano Zampini A->info.mallocs += 0; 4183fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 4184fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 4185fcdce8c4SStefano Zampini A->num_ass++; 41867e8381f9SStefano Zampini PetscFunctionReturn(0); 41877e8381f9SStefano Zampini } 41887e8381f9SStefano Zampini 4189*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4190*d71ae5a4SJacob Faibussowitsch { 4191a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4192a49f1ed0SStefano Zampini 4193a49f1ed0SStefano Zampini PetscFunctionBegin; 4194a49f1ed0SStefano Zampini 
PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4195a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 4196a49f1ed0SStefano Zampini if (destroy) { 41979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4198a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 4199a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 4200a49f1ed0SStefano Zampini } 42011a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 4202a49f1ed0SStefano Zampini PetscFunctionReturn(0); 4203a49f1ed0SStefano Zampini } 4204a49f1ed0SStefano Zampini 42057e8381f9SStefano Zampini #include <thrust/binary_search.h> 4206219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 4207*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) 4208*d71ae5a4SJacob Faibussowitsch { 42097e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 42107e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 42117e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 42127e8381f9SStefano Zampini 42137e8381f9SStefano Zampini PetscFunctionBegin; 42149566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->rmap)); 42159566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->cmap)); 42167e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 42177e8381f9SStefano Zampini if (n != cooPerm_n) { 42187e8381f9SStefano Zampini delete cusp->cooPerm; 42197e8381f9SStefano Zampini delete cusp->cooPerm_a; 42207e8381f9SStefano Zampini cusp->cooPerm = NULL; 42217e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 42227e8381f9SStefano Zampini } 42237e8381f9SStefano Zampini if (n) { 4224e8729f6fSJunchao Zhang thrust::device_ptr<PetscInt> d_i, d_j; 4225e8729f6fSJunchao Zhang PetscInt *d_raw_i, *d_raw_j; 4226e8729f6fSJunchao Zhang PetscBool free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE; 4227e8729f6fSJunchao Zhang PetscMemType imtype, jmtype; 4228e8729f6fSJunchao Zhang 4229e8729f6fSJunchao Zhang PetscCall(PetscGetMemType(coo_i, &imtype)); 4230e8729f6fSJunchao Zhang if (PetscMemTypeHost(imtype)) { 4231e8729f6fSJunchao Zhang PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n)); 4232e8729f6fSJunchao Zhang PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 4233e8729f6fSJunchao Zhang d_i = thrust::device_pointer_cast(d_raw_i); 4234e8729f6fSJunchao Zhang free_raw_i = PETSC_TRUE; 4235e8729f6fSJunchao Zhang PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt))); 4236e8729f6fSJunchao Zhang } else { 4237e8729f6fSJunchao Zhang d_i = thrust::device_pointer_cast(coo_i); 4238e8729f6fSJunchao Zhang } 4239e8729f6fSJunchao Zhang 4240e8729f6fSJunchao Zhang PetscCall(PetscGetMemType(coo_j, &jmtype)); 4241e8729f6fSJunchao Zhang if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]! 4242e8729f6fSJunchao Zhang PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n)); 4243e8729f6fSJunchao Zhang PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 4244e8729f6fSJunchao Zhang d_j = thrust::device_pointer_cast(d_raw_j); 4245e8729f6fSJunchao Zhang free_raw_j = PETSC_TRUE; 4246e8729f6fSJunchao Zhang PetscCall(PetscLogCpuToGpu(1. 
* n * sizeof(PetscInt))); 4247e8729f6fSJunchao Zhang } else { 4248e8729f6fSJunchao Zhang d_j = thrust::device_pointer_cast(coo_j); 4249e8729f6fSJunchao Zhang } 4250e8729f6fSJunchao Zhang 42517e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 42527e8381f9SStefano Zampini 4253ad540459SPierre Jolivet if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n); 4254ad540459SPierre Jolivet if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n); 42557e8381f9SStefano Zampini 4256ddea5d60SJunchao Zhang /* Ex. 4257ddea5d60SJunchao Zhang n = 6 4258ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 4259ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 4260ddea5d60SJunchao Zhang */ 4261e8729f6fSJunchao Zhang auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j)); 4262e8729f6fSJunchao Zhang auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n)); 42637e8381f9SStefano Zampini 42649566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 42657e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4266ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4267e8729f6fSJunchao Zhang (*cusp->cooPerm_a).assign(d_i, d_i + n); /* copy the sorted array */ 4268e8729f6fSJunchao Zhang THRUSTINTARRAY w(d_j, d_j + n); 42697e8381f9SStefano Zampini 4270ddea5d60SJunchao Zhang /* 4271ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 4272ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 4273ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 4274ddea5d60SJunchao Zhang */ 4275ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4276ddea5d60SJunchao Zhang 4277ddea5d60SJunchao Zhang /* 4278ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 4279ddea5d60SJunchao Zhang ^ekey 4280ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 4281ddea5d60SJunchao Zhang ^nekye 4282ddea5d60SJunchao Zhang */ 42837e8381f9SStefano Zampini if (nekey == 
ekey) { /* all entries are unique */ 42847e8381f9SStefano Zampini delete cusp->cooPerm_a; 42857e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 4286ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4287ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4288ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4289ddea5d60SJunchao Zhang adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4290ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 42917e8381f9SStefano Zampini w[0] = 0; 4292ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4293ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 42947e8381f9SStefano Zampini } 42957e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 4296e8729f6fSJunchao Zhang thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4297ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4298ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. 
A leading 0 will be added later */ 42999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 43007e8381f9SStefano Zampini 43019566063dSJacob Faibussowitsch PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i)); 43027e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 43037e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 43047e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 43059566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i)); 4306ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 43079566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 43087e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 4309fcdce8c4SStefano Zampini a->rmax = 0; 43109566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz, &a->a)); 43119566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz, &a->j)); 4312e8729f6fSJunchao Zhang PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 43139566063dSJacob Faibussowitsch if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen)); 43149566063dSJacob Faibussowitsch if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax)); 43157e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 43167e8381f9SStefano Zampini const PetscInt nnzr = a->i[i + 1] - a->i[i]; 43177e8381f9SStefano Zampini nzr += (PetscInt) !!(nnzr); 43187e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 4319fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax, nnzr); 43207e8381f9SStefano Zampini } 4321fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 43227e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 43239566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt))); 43249566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(A)); 4325e8729f6fSJunchao Zhang if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i)); 
4326e8729f6fSJunchao Zhang if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j)); 43277e8381f9SStefano Zampini } else { 43289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL)); 43297e8381f9SStefano Zampini } 43309566063dSJacob Faibussowitsch PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 43317e8381f9SStefano Zampini 43327e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 4333e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 43349566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->nz)); 43359566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6)); 43367e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 43379566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 43389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 43397e8381f9SStefano Zampini PetscFunctionReturn(0); 43407e8381f9SStefano Zampini } 4341ed502f03SStefano Zampini 4342*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4343*d71ae5a4SJacob Faibussowitsch { 4344219fbbafSJunchao Zhang Mat_SeqAIJ *seq; 4345219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev; 4346cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 4347219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4348219fbbafSJunchao Zhang 4349219fbbafSJunchao Zhang PetscFunctionBegin; 43509566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 43519566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4352219fbbafSJunchao Zhang if (coo_i) { 43539566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i, &mtype)); 4354219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 4355219fbbafSJunchao Zhang for (PetscCount k = 0; k < coo_n; k++) { 
43569371c9d4SSatish Balay if (coo_i[k] < 0 || coo_j[k] < 0) { 43579371c9d4SSatish Balay coo_basic = PETSC_FALSE; 43589371c9d4SSatish Balay break; 43599371c9d4SSatish Balay } 4360219fbbafSJunchao Zhang } 4361219fbbafSJunchao Zhang } 4362219fbbafSJunchao Zhang } 4363219fbbafSJunchao Zhang 4364219fbbafSJunchao Zhang if (coo_basic) { /* i,j are on device or do not contain negative indices */ 43659566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j)); 4366219fbbafSJunchao Zhang } else { 43679566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 4368cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 43699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4370219fbbafSJunchao Zhang seq = static_cast<Mat_SeqAIJ *>(mat->data); 4371219fbbafSJunchao Zhang dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 43729566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount))); 43739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 43749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount))); 43759566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4376219fbbafSJunchao Zhang dev->use_extended_coo = PETSC_TRUE; 4377219fbbafSJunchao Zhang } 4378219fbbafSJunchao Zhang PetscFunctionReturn(0); 4379219fbbafSJunchao Zhang } 4380219fbbafSJunchao Zhang 4381*d71ae5a4SJacob Faibussowitsch __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) 4382*d71ae5a4SJacob Faibussowitsch { 4383219fbbafSJunchao Zhang PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4384219fbbafSJunchao 
Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4385b6c38306SJunchao Zhang for (; i < nnz; i += grid_size) { 4386b6c38306SJunchao Zhang PetscScalar sum = 0.0; 4387b6c38306SJunchao Zhang for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4388b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum; 4389b6c38306SJunchao Zhang } 4390219fbbafSJunchao Zhang } 4391219fbbafSJunchao Zhang 4392*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4393*d71ae5a4SJacob Faibussowitsch { 4394219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4395219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4396219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4397219fbbafSJunchao Zhang PetscMemType memtype; 4398219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4399219fbbafSJunchao Zhang PetscScalar *Aa; 4400219fbbafSJunchao Zhang 4401219fbbafSJunchao Zhang PetscFunctionBegin; 4402219fbbafSJunchao Zhang if (dev->use_extended_coo) { 44039566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 4404219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 44059566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar))); 44069566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4407219fbbafSJunchao Zhang } 4408219fbbafSJunchao Zhang 44099566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 44109566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4411219fbbafSJunchao Zhang 4412cbc6b225SStefano Zampini if (Annz) { 4413b6c38306SJunchao Zhang MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa); 44149566063dSJacob 
Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 4415cbc6b225SStefano Zampini } 4416219fbbafSJunchao Zhang 44179566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 44189566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4419219fbbafSJunchao Zhang 44209566063dSJacob Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 4421219fbbafSJunchao Zhang } else { 44229566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode)); 4423219fbbafSJunchao Zhang } 4424219fbbafSJunchao Zhang PetscFunctionReturn(0); 4425219fbbafSJunchao Zhang } 4426219fbbafSJunchao Zhang 44275b7e41feSStefano Zampini /*@C 442811a5261eSBarry Smith MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices. 44295b7e41feSStefano Zampini 44305b7e41feSStefano Zampini Not collective 44315b7e41feSStefano Zampini 44325b7e41feSStefano Zampini Input Parameters: 44335b7e41feSStefano Zampini + A - the matrix 443411a5261eSBarry Smith - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 44355b7e41feSStefano Zampini 44365b7e41feSStefano Zampini Output Parameters: 44375b7e41feSStefano Zampini + ia - the CSR row pointers 44385b7e41feSStefano Zampini - ja - the CSR column indices 44395b7e41feSStefano Zampini 44405b7e41feSStefano Zampini Level: developer 44415b7e41feSStefano Zampini 444211a5261eSBarry Smith Note: 44435b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 44445b7e41feSStefano Zampini 4445db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 44465b7e41feSStefano Zampini @*/ 4447*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4448*d71ae5a4SJacob Faibussowitsch { 
44495f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 44505f101d05SStefano Zampini CsrMatrix *csr; 44515f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 44525f101d05SStefano Zampini 44535f101d05SStefano Zampini PetscFunctionBegin; 44545f101d05SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 44555f101d05SStefano Zampini if (!i || !j) PetscFunctionReturn(0); 44565f101d05SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4457aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 44589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 445928b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 44605f101d05SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 44615f101d05SStefano Zampini if (i) { 44625f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 44635f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 44645f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 44655f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 44669566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 44675f101d05SStefano Zampini } 44685f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 44695f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 44705f101d05SStefano Zampini } 44715f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 44725f101d05SStefano Zampini PetscFunctionReturn(0); 44735f101d05SStefano Zampini } 44745f101d05SStefano Zampini 44755b7e41feSStefano Zampini /*@C 447611a5261eSBarry Smith MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()` 44775b7e41feSStefano Zampini 
44785b7e41feSStefano Zampini Not collective 44795b7e41feSStefano Zampini 44805b7e41feSStefano Zampini Input Parameters: 44815b7e41feSStefano Zampini + A - the matrix 448211a5261eSBarry Smith - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 44835b7e41feSStefano Zampini 44845b7e41feSStefano Zampini Output Parameters: 44855b7e41feSStefano Zampini + ia - the CSR row pointers 44865b7e41feSStefano Zampini - ja - the CSR column indices 44875b7e41feSStefano Zampini 44885b7e41feSStefano Zampini Level: developer 44895b7e41feSStefano Zampini 4490db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()` 44915b7e41feSStefano Zampini @*/ 4492*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4493*d71ae5a4SJacob Faibussowitsch { 44945f101d05SStefano Zampini PetscFunctionBegin; 44955f101d05SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 44965f101d05SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 44975f101d05SStefano Zampini if (i) *i = NULL; 44985f101d05SStefano Zampini if (j) *j = NULL; 44995f101d05SStefano Zampini PetscFunctionReturn(0); 45005f101d05SStefano Zampini } 45015f101d05SStefano Zampini 45025b7e41feSStefano Zampini /*@C 450311a5261eSBarry Smith MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 45045b7e41feSStefano Zampini 45055b7e41feSStefano Zampini Not Collective 45065b7e41feSStefano Zampini 45075b7e41feSStefano Zampini Input Parameter: 450811a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 45095b7e41feSStefano Zampini 45105b7e41feSStefano Zampini Output Parameter: 45115b7e41feSStefano Zampini . 
a - pointer to the device data 45125b7e41feSStefano Zampini 45135b7e41feSStefano Zampini Level: developer 45145b7e41feSStefano Zampini 451511a5261eSBarry Smith Note: 451611a5261eSBarry Smith May trigger host-device copies if up-to-date matrix data is on host 45175b7e41feSStefano Zampini 4518db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 45195b7e41feSStefano Zampini @*/ 4520*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) 4521*d71ae5a4SJacob Faibussowitsch { 4522ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4523ed502f03SStefano Zampini CsrMatrix *csr; 4524ed502f03SStefano Zampini 4525ed502f03SStefano Zampini PetscFunctionBegin; 4526ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4527ed502f03SStefano Zampini PetscValidPointer(a, 2); 4528ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4529aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 45309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 453128b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4532ed502f03SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 453328b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4534ed502f03SStefano Zampini *a = csr->values->data().get(); 4535ed502f03SStefano Zampini PetscFunctionReturn(0); 4536ed502f03SStefano Zampini } 4537ed502f03SStefano Zampini 45385b7e41feSStefano Zampini /*@C 453911a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()` 45405b7e41feSStefano Zampini 45415b7e41feSStefano Zampini Not Collective 
45425b7e41feSStefano Zampini 45435b7e41feSStefano Zampini Input Parameter: 454411a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 45455b7e41feSStefano Zampini 45465b7e41feSStefano Zampini Output Parameter: 45475b7e41feSStefano Zampini . a - pointer to the device data 45485b7e41feSStefano Zampini 45495b7e41feSStefano Zampini Level: developer 45505b7e41feSStefano Zampini 4551db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 45525b7e41feSStefano Zampini @*/ 4553*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) 4554*d71ae5a4SJacob Faibussowitsch { 4555ed502f03SStefano Zampini PetscFunctionBegin; 4556ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4557ed502f03SStefano Zampini PetscValidPointer(a, 2); 4558ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4559ed502f03SStefano Zampini *a = NULL; 4560ed502f03SStefano Zampini PetscFunctionReturn(0); 4561ed502f03SStefano Zampini } 4562ed502f03SStefano Zampini 45635b7e41feSStefano Zampini /*@C 456411a5261eSBarry Smith MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 45655b7e41feSStefano Zampini 45665b7e41feSStefano Zampini Not Collective 45675b7e41feSStefano Zampini 45685b7e41feSStefano Zampini Input Parameter: 456911a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 45705b7e41feSStefano Zampini 45715b7e41feSStefano Zampini Output Parameter: 45725b7e41feSStefano Zampini . 
a - pointer to the device data 45735b7e41feSStefano Zampini 45745b7e41feSStefano Zampini Level: developer 45755b7e41feSStefano Zampini 457611a5261eSBarry Smith Note: 457711a5261eSBarry Smith May trigger host-device copies if up-to-date matrix data is on host 45785b7e41feSStefano Zampini 4579db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 45805b7e41feSStefano Zampini @*/ 4581*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) 4582*d71ae5a4SJacob Faibussowitsch { 4583039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4584039c6fbaSStefano Zampini CsrMatrix *csr; 4585039c6fbaSStefano Zampini 4586039c6fbaSStefano Zampini PetscFunctionBegin; 4587039c6fbaSStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4588039c6fbaSStefano Zampini PetscValidPointer(a, 2); 4589039c6fbaSStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4590aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 45919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 459228b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4593039c6fbaSStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 459428b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4595039c6fbaSStefano Zampini *a = csr->values->data().get(); 4596039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 45979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4598039c6fbaSStefano Zampini PetscFunctionReturn(0); 4599039c6fbaSStefano Zampini } 46005b7e41feSStefano Zampini /*@C 460111a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained 
from `MatSeqAIJCUSPARSEGetArray()` 4602039c6fbaSStefano Zampini 46035b7e41feSStefano Zampini Not Collective 46045b7e41feSStefano Zampini 46055b7e41feSStefano Zampini Input Parameter: 460611a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 46075b7e41feSStefano Zampini 46085b7e41feSStefano Zampini Output Parameter: 46095b7e41feSStefano Zampini . a - pointer to the device data 46105b7e41feSStefano Zampini 46115b7e41feSStefano Zampini Level: developer 46125b7e41feSStefano Zampini 4613db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()` 46145b7e41feSStefano Zampini @*/ 4615*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) 4616*d71ae5a4SJacob Faibussowitsch { 4617039c6fbaSStefano Zampini PetscFunctionBegin; 4618039c6fbaSStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4619039c6fbaSStefano Zampini PetscValidPointer(a, 2); 4620039c6fbaSStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 46219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 46229566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4623039c6fbaSStefano Zampini *a = NULL; 4624039c6fbaSStefano Zampini PetscFunctionReturn(0); 4625039c6fbaSStefano Zampini } 4626039c6fbaSStefano Zampini 46275b7e41feSStefano Zampini /*@C 462811a5261eSBarry Smith MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 46295b7e41feSStefano Zampini 46305b7e41feSStefano Zampini Not Collective 46315b7e41feSStefano Zampini 46325b7e41feSStefano Zampini Input Parameter: 463311a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 46345b7e41feSStefano Zampini 46355b7e41feSStefano Zampini Output Parameter: 46365b7e41feSStefano Zampini . 
a - pointer to the device data 46375b7e41feSStefano Zampini 46385b7e41feSStefano Zampini Level: developer 46395b7e41feSStefano Zampini 464011a5261eSBarry Smith Note: 464111a5261eSBarry Smith Does not trigger host-device copies and flags data validity on the GPU 46425b7e41feSStefano Zampini 4643db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 46445b7e41feSStefano Zampini @*/ 4645*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) 4646*d71ae5a4SJacob Faibussowitsch { 4647ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4648ed502f03SStefano Zampini CsrMatrix *csr; 4649ed502f03SStefano Zampini 4650ed502f03SStefano Zampini PetscFunctionBegin; 4651ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4652ed502f03SStefano Zampini PetscValidPointer(a, 2); 4653ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4654aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 465528b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4656ed502f03SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 465728b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4658ed502f03SStefano Zampini *a = csr->values->data().get(); 4659039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 46609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4661ed502f03SStefano Zampini PetscFunctionReturn(0); 4662ed502f03SStefano Zampini } 4663ed502f03SStefano Zampini 46645b7e41feSStefano Zampini /*@C 466511a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from 
`MatSeqAIJCUSPARSEGetArrayWrite()` 46665b7e41feSStefano Zampini 46675b7e41feSStefano Zampini Not Collective 46685b7e41feSStefano Zampini 46695b7e41feSStefano Zampini Input Parameter: 467011a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 46715b7e41feSStefano Zampini 46725b7e41feSStefano Zampini Output Parameter: 46735b7e41feSStefano Zampini . a - pointer to the device data 46745b7e41feSStefano Zampini 46755b7e41feSStefano Zampini Level: developer 46765b7e41feSStefano Zampini 4677db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 46785b7e41feSStefano Zampini @*/ 4679*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) 4680*d71ae5a4SJacob Faibussowitsch { 4681ed502f03SStefano Zampini PetscFunctionBegin; 4682ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4683ed502f03SStefano Zampini PetscValidPointer(a, 2); 4684ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 46859566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 46869566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4687ed502f03SStefano Zampini *a = NULL; 4688ed502f03SStefano Zampini PetscFunctionReturn(0); 4689ed502f03SStefano Zampini } 4690ed502f03SStefano Zampini 46919371c9d4SSatish Balay struct IJCompare4 { 4692*d71ae5a4SJacob Faibussowitsch __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4693*d71ae5a4SJacob Faibussowitsch { 4694ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4695ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4696ed502f03SStefano Zampini return false; 4697ed502f03SStefano Zampini } 4698ed502f03SStefano Zampini }; 4699ed502f03SStefano Zampini 47009371c9d4SSatish Balay struct Shift { 4701ed502f03SStefano Zampini int _shift; 4702ed502f03SStefano Zampini 
4703ed502f03SStefano Zampini Shift(int shift) : _shift(shift) { } 47049371c9d4SSatish Balay __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4705ed502f03SStefano Zampini }; 4706ed502f03SStefano Zampini 4707ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4708*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4709*d71ae5a4SJacob Faibussowitsch { 4710ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4711ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4712ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4713ed502f03SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 4714ed502f03SStefano Zampini PetscInt Annz, Bnnz; 4715ed502f03SStefano Zampini cusparseStatus_t stat; 4716ed502f03SStefano Zampini PetscInt i, m, n, zero = 0; 4717ed502f03SStefano Zampini 4718ed502f03SStefano Zampini PetscFunctionBegin; 4719ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4720ed502f03SStefano Zampini PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4721ed502f03SStefano Zampini PetscValidPointer(C, 4); 4722ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4723ed502f03SStefano Zampini PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 47245f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 472508401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4726aed4548fSBarry Smith PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4727aed4548fSBarry Smith 
PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4728ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4729ed502f03SStefano Zampini m = A->rmap->n; 4730ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 47319566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, C)); 47329566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C, m, n, m, n)); 47339566063dSJacob Faibussowitsch PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4734ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4735ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4736ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4737ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4738ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4739ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4740ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4741ed502f03SStefano Zampini c->compressedrow.i = NULL; 4742ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4743ed502f03SStefano Zampini Ccusp->workVector = NULL; 4744ed502f03SStefano Zampini Ccusp->nrows = m; 4745ed502f03SStefano Zampini Ccusp->mat = Cmat; 4746ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4747ed502f03SStefano Zampini Ccsr->num_rows = m; 4748ed502f03SStefano Zampini Ccsr->num_cols = n; 47499566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 47509566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 47519566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 47529566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 47539566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 47549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void 
**)&(Cmat->beta_one), sizeof(PetscScalar))); 47559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47579566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 47599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 476028b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 476128b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4762ed502f03SStefano Zampini 4763ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4764ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4765ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4766ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4767ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4768ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4769ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4770ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4771ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4772ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4773ed502f03SStefano Zampini if (c->nz) { 47742ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 47752ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 47762ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 47772ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff, *Broff; 47782ed87e7eSStefano Zampini 4779ed502f03SStefano Zampini if 
(a->compressedrow.use) { /* need full row offset */ 4780ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4781ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4782ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 47839566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4784ed502f03SStefano Zampini } 47852ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 47862ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4787ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4788ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4789ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4790ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 47919566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4792ed502f03SStefano Zampini } 47932ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 47942ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 47959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 47969371c9d4SSatish Balay stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 47979371c9d4SSatish Balay PetscCallCUSPARSE(stat); 47989371c9d4SSatish Balay stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 47999371c9d4SSatish Balay PetscCallCUSPARSE(stat); 48002ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 48012ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 48022ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 48038909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4804ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), 
Shift(A->cmap->n)); 4805ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 48068909a122SStefano Zampini #else 48078909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 48088909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 48098909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 48108909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 48118909a122SStefano Zampini #endif 48122ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 48132ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 48142ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 48152ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 48162ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 48172ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4818ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 4819ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4820ed502f03SStefano Zampini thrust::advance(p2, Annz); 4821792fecdfSBarry Smith PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 48228909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 48238909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 48248909a122SStefano Zampini #endif 48252ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 
48262ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 48272ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 4828792fecdfSBarry Smith PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 48292ed87e7eSStefano Zampini #else 48302ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 4831792fecdfSBarry Smith PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4832792fecdfSBarry Smith PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 48332ed87e7eSStefano Zampini #endif 48349371c9d4SSatish Balay stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 48359371c9d4SSatish Balay PetscCallCUSPARSE(stat); 48369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 48372ed87e7eSStefano Zampini delete wPerm; 48382ed87e7eSStefano Zampini delete Acoo; 48392ed87e7eSStefano Zampini delete Bcoo; 48402ed87e7eSStefano Zampini delete Ccoo; 4841ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 48429371c9d4SSatish Balay stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 48439371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4844ed502f03SStefano Zampini #endif 48451a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 48469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 48479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4848ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? 
PETSC_TRUE : PETSC_FALSE; 4849ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4850ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4851ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4852ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4853ed502f03SStefano Zampini 48541a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 48551a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4856a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4857ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4858ed502f03SStefano Zampini CmatT->mat = CcsrT; 4859ed502f03SStefano Zampini CcsrT->num_rows = n; 4860ed502f03SStefano Zampini CcsrT->num_cols = m; 4861ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4862ed502f03SStefano Zampini 4863ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4864ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4865ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4866ed502f03SStefano Zampini 48679566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4868ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4869ed502f03SStefano Zampini if (AT) { 4870ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4871ed502f03SStefano Zampini thrust::advance(rT, -1); 4872ed502f03SStefano Zampini } 4873ed502f03SStefano Zampini if (BT) { 4874ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4875ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4876ed502f03SStefano Zampini thrust::copy(titb, tite, rT); 4877ed502f03SStefano Zampini } 4878ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4879ed502f03SStefano Zampini if (AT) cT 
= thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4880ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4881ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4882ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4883ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 48849566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4885ed502f03SStefano Zampini 48869566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 48879566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 48889566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 48899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar))); 48909566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar))); 48919566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 48929566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 48939566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 48949566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4895ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 48969371c9d4SSatish Balay stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 
CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 48979371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4898ed502f03SStefano Zampini #endif 4899ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4900ed502f03SStefano Zampini } 4901ed502f03SStefano Zampini } 4902ed502f03SStefano Zampini 4903ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4904ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4905ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 49069566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &c->i)); 49079566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->j)); 4908ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4909ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4910ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4911ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4912ed502f03SStefano Zampini jj = *Ccsr->column_indices; 49139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 49149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4915ed502f03SStefano Zampini } else { 49169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 49179566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4918ed502f03SStefano Zampini } 49199566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 49209566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 49219566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, 
&c->imax)); 4922ed502f03SStefano Zampini c->maxnz = c->nz; 4923ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4924ed502f03SStefano Zampini c->rmax = 0; 4925ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4926ed502f03SStefano Zampini const PetscInt nn = c->i[i + 1] - c->i[i]; 4927ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4928ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt) !!nn; 4929ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 4930ed502f03SStefano Zampini } 49319566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 49329566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->a)); 4933ed502f03SStefano Zampini (*C)->nonzerostate++; 49349566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 49359566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 4936ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4937ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4938ed502f03SStefano Zampini } else { 493908401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4940ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4941ed502f03SStefano Zampini if (c->nz) { 4942ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 49435f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm"); 4944aed4548fSBarry Smith PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 494508401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 49469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 49479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 49485f80ce2aSJacob Faibussowitsch 
PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 49495f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4950ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4951ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4952ed502f03SStefano Zampini Ccsr = (CsrMatrix *)Ccusp->mat->mat; 4953aed4548fSBarry Smith PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4954aed4548fSBarry Smith PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4955aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4956aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 49575f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 4958ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4959ed502f03SStefano Zampini thrust::advance(pmid, Acsr->num_entries); 49609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 49619371c9d4SSatish Balay auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 49629371c9d4SSatish Balay auto 
zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4963ed502f03SStefano Zampini thrust::for_each(zibait, zieait, VecCUDAEquals()); 49649371c9d4SSatish Balay auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 49659371c9d4SSatish Balay auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 4966ed502f03SStefano Zampini thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 49679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 49681a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 49695f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4970ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4971ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4972ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        /* refresh the values of the stored explicit transpose of C by concatenating A^T and B^T values */
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - copy entries of the matrix value array of a
  SEQAIJCUSPARSE matrix into a caller-provided buffer, gathering on the GPU when an
  index list is given.

  Input Parameters:
+ A   - the matrix; its device value array is accessed read-only
. n   - number of scalar entries to copy
- idx - indices into the value array to gather, or NULL to copy the first n entries

  Output Parameter:
. v - destination buffer; may be host or device memory (detected with isCudaMem())
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* does the destination live in device memory? */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v); /* gather directly into v */
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer, copied back to the host below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = av[idx[i]], performed on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* contiguous copy of the first n values; direction depends on where v lives */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory, n scalars moved device->host above: log as GpuToCpu
     (was incorrectly logged as CpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}