19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 69ae82921SPaul Mullowney 73d13b8fdSMatthew G. Knepley #include <petscconf.h> 83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 11af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 129ae82921SPaul Mullowney #undef VecType 133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14 16d0967f54SJacob Faibussowitsch #define PETSC_HAVE_THRUST_ASYNC 1 17d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14 18a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 19d0967f54SJacob Faibussowitsch #endif 20a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 21a2cee5feSJed Brown #include <thrust/remove.h> 22a2cee5feSJed Brown #include <thrust/sort.h> 23a2cee5feSJed Brown #include <thrust/unique.h> 24e8d2b73aSMark Adams 25b0c00012SPierre Jolivet PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 26e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 27afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 28afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 29afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
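   For example, "CSRMV_ALG1" is entry 2 of MatCUSPARSESpMVAlgorithms[] below and corresponds to
   CUSPARSE_CSRMV_ALG1 = 2 in the listing that follows; the PetscCheck() calls in
   MatSetFromOptions_SeqAIJCUSPARSE() re-verify this correspondence against the installed cuSPARSE
   headers whenever a user actually selects one of these options.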
30afb2bd1cSJunchao Zhang 31afb2bd1cSJunchao Zhang typedef enum { 32afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 35afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 36afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 37afb2bd1cSJunchao Zhang 38afb2bd1cSJunchao Zhang typedef enum { 39afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 40afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 41afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 42afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 43afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 47afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 48afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 49afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 50afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 51afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 52afb2bd1cSJunchao Zhang 53afb2bd1cSJunchao Zhang typedef enum { 5435cb6cd3SPierre Jolivet CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic 5535cb6cd3SPierre Jolivet CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic 56afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 57afb2bd1cSJunchao Zhang */ 58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 59afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 60afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! 
We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 61afb2bd1cSJunchao Zhang #endif 629ae82921SPaul Mullowney 63087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 65087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 666fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 67b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 70d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 716fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 72d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 73d460d7bfSJunchao Zhang #endif 74ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject); 75a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 7633c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 776fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 786fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 796fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 806fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 81e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 82e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 83e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 849ae82921SPaul Mullowney 857f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **); 882c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat); 897f756511SDominic Meiser 9057181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 91a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool); 9257181aedSStefano Zampini 93c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]); 94e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]); 95219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode); 96c215019aSStefano Zampini 97d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 98d71ae5a4SJacob Faibussowitsch { 
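  /* This is the MATSEQAIJCUSPARSE implementation dispatched to by the public MatCUSPARSESetFormat() below
     via PetscTryMethod() on "MatCUSPARSESetFormat_C". As an illustrative sketch (A is assumed to already be
     of type MATSEQAIJCUSPARSE), a caller selects the ELL kernel for MatMult() with

       PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, MAT_CUSPARSE_ELL));

     or, equivalently, with the command line option -mat_cusparse_mult_storage_format ell. */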
99aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1006e111a19SKarl Rupp 101ca45077fSPaul Mullowney PetscFunctionBegin; 102ca45077fSPaul Mullowney switch (op) { 103d71ae5a4SJacob Faibussowitsch case MAT_CUSPARSE_MULT: 104d71ae5a4SJacob Faibussowitsch cusparsestruct->format = format; 105d71ae5a4SJacob Faibussowitsch break; 106d71ae5a4SJacob Faibussowitsch case MAT_CUSPARSE_ALL: 107d71ae5a4SJacob Faibussowitsch cusparsestruct->format = format; 108d71ae5a4SJacob Faibussowitsch break; 109d71ae5a4SJacob Faibussowitsch default: 110d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op); 111ca45077fSPaul Mullowney } 1123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 113ca45077fSPaul Mullowney } 1149ae82921SPaul Mullowney 115e057df02SPaul Mullowney /*@ 11611a5261eSBarry Smith MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular 11711a5261eSBarry Smith operation. Only the `MatMult()` operation can use different GPU storage formats 11811a5261eSBarry Smith 119e057df02SPaul Mullowney Not Collective 120e057df02SPaul Mullowney 121e057df02SPaul Mullowney Input Parameters: 12211a5261eSBarry Smith + A - Matrix of type `MATSEQAIJCUSPARSE` 1232ef1f0ffSBarry Smith . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. 1242ef1f0ffSBarry Smith `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`. 12511a5261eSBarry Smith - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.) 126e057df02SPaul Mullowney 127e057df02SPaul Mullowney Level: intermediate 128e057df02SPaul Mullowney 129fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 130e057df02SPaul Mullowney @*/ 131d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 132d71ae5a4SJacob Faibussowitsch { 133e057df02SPaul Mullowney PetscFunctionBegin; 134e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 135cac4c232SBarry Smith PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format)); 1363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 137e057df02SPaul Mullowney } 138e057df02SPaul Mullowney 139d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) 140d71ae5a4SJacob Faibussowitsch { 141365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 142365b711fSMark Adams 143365b711fSMark Adams PetscFunctionBegin; 144365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 1453ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 146365b711fSMark Adams } 147365b711fSMark Adams 148365b711fSMark Adams /*@ 14911a5261eSBarry Smith MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`. 
150365b711fSMark Adams 151365b711fSMark Adams Input Parameters: 15211a5261eSBarry Smith + A - Matrix of type `MATSEQAIJCUSPARSE` 15311a5261eSBarry Smith - use_cpu - set flag for using the built-in CPU `MatSolve()` 154365b711fSMark Adams 1552ef1f0ffSBarry Smith Level: intermediate 156365b711fSMark Adams 15711a5261eSBarry Smith Note: 158365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 159365b711fSMark Adams and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 160365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 161365b711fSMark Adams 1621cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 163365b711fSMark Adams @*/ 164d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) 165d71ae5a4SJacob Faibussowitsch { 166365b711fSMark Adams PetscFunctionBegin; 167365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 168cac4c232SBarry Smith PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu)); 1693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 170365b711fSMark Adams } 171365b711fSMark Adams 17266976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) 173d71ae5a4SJacob Faibussowitsch { 174e6e9a74fSStefano Zampini PetscFunctionBegin; 1751a2c6b5cSJunchao Zhang switch (op) { 1761a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 1771a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 1789566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1791a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 1801a2c6b5cSJunchao Zhang break; 181d71ae5a4SJacob Faibussowitsch default: 182d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 183d71ae5a4SJacob Faibussowitsch break; 184e6e9a74fSStefano Zampini } 1853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 186e6e9a74fSStefano Zampini } 187e6e9a74fSStefano Zampini 188ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject) 189d71ae5a4SJacob Faibussowitsch { 190e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 1919ae82921SPaul Mullowney PetscBool flg; 192a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1936e111a19SKarl Rupp 1949ae82921SPaul Mullowney PetscFunctionBegin; 195d0609cedSBarry Smith PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options"); 1969ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 1979371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 1989566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format)); 199afb2bd1cSJunchao Zhang 2009371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", 
MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 2019566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 2029566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg)); 2039566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve)); 204afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2059371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg)); 206afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 207b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 208aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 209a435da06SStefano Zampini #else 210aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 211a435da06SStefano Zampini #endif 2129371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg)); 213aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 214afb2bd1cSJunchao Zhang 2159371c9d4SSatish Balay PetscCall( 2169371c9d4SSatish Balay PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg)); 217aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 218afb2bd1cSJunchao Zhang #endif 2194c87dfd4SPaul Mullowney } 220d0609cedSBarry Smith PetscOptionsHeadEnd(); 2213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2229ae82921SPaul Mullowney } 2239ae82921SPaul Mullowney 224b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 225d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A) 226d460d7bfSJunchao Zhang { 227d460d7bfSJunchao Zhang Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data); 228d460d7bfSJunchao Zhang PetscInt m = A->rmap->n; 229d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 230d460d7bfSJunchao Zhang const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag; 231d460d7bfSJunchao Zhang const MatScalar *Aa = a->a; 232d460d7bfSJunchao Zhang PetscInt *Mi, *Mj, Mnz; 233d460d7bfSJunchao Zhang PetscScalar *Ma; 
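  // The factored SeqAIJ matrix stores its L and U factors in a "skewed" layout: Ai/Aj/Aa hold the strictly
  // lower triangular part of L row by row (the unit diagonal is implicit), while Adiag indexes U from the
  // last row backwards and the diagonal of U is kept inverted (see the "recover the diagonal entry" step
  // below). This routine merges both factors into one regular CSR matrix M on the host, schematically
  //
  //   row i of M = [ entries of L(i,:) | 1/Aa[Adiag[i]] | entries of U(i,:) right of the diagonal ]
  //
  // with llen = Ai[i+1] - Ai[i], ulen = Adiag[i] - Adiag[i+1] (diagonal included), and
  // Mi[i+1] = Mi[i] + llen + ulen, and then uploads M to the device for the cusparseSpSV solves.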
234d460d7bfSJunchao Zhang 235d460d7bfSJunchao Zhang PetscFunctionBegin; 236d460d7bfSJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU 237d460d7bfSJunchao Zhang if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0 238d460d7bfSJunchao Zhang // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host 239d460d7bfSJunchao Zhang Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal) 240d460d7bfSJunchao Zhang PetscCall(PetscMalloc1(m + 1, &Mi)); 241d460d7bfSJunchao Zhang PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp 242d460d7bfSJunchao Zhang PetscCall(PetscMalloc1(Mnz, &Ma)); 243d460d7bfSJunchao Zhang Mi[0] = 0; 244d460d7bfSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 245d460d7bfSJunchao Zhang PetscInt llen = Ai[i + 1] - Ai[i]; 246d460d7bfSJunchao Zhang PetscInt ulen = Adiag[i] - Adiag[i + 1]; 247d460d7bfSJunchao Zhang PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L 248d460d7bfSJunchao Zhang Mj[Mi[i] + llen] = i; // diagonal entry 249d460d7bfSJunchao Zhang PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 250d460d7bfSJunchao Zhang Mi[i + 1] = Mi[i] + llen + ulen; 251d460d7bfSJunchao Zhang } 252d460d7bfSJunchao Zhang // Copy M (L,U) from host to device 253f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1))); 254f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz)); 255f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz)); 256f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice)); 257f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice)); 258d460d7bfSJunchao Zhang 259d460d7bfSJunchao Zhang // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 260d460d7bfSJunchao Zhang // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 261d460d7bfSJunchao Zhang // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 262d460d7bfSJunchao Zhang // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 263d460d7bfSJunchao Zhang // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 264d460d7bfSJunchao Zhang cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER; 265d460d7bfSJunchao Zhang cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; 266d460d7bfSJunchao Zhang const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? 
CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 267d460d7bfSJunchao Zhang 268d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 269d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 270d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 271d460d7bfSJunchao Zhang 272d460d7bfSJunchao Zhang fillMode = CUSPARSE_FILL_MODE_UPPER; 273d460d7bfSJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 274d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 275d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 276d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 277d460d7bfSJunchao Zhang 278d460d7bfSJunchao Zhang // Allocate work vectors in SpSv 279f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m)); 280f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m)); 281d460d7bfSJunchao Zhang 282d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 283d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 284d460d7bfSJunchao Zhang 285d460d7bfSJunchao Zhang // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE 286d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 287d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 288d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 289d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 290d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 291d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 292d460d7bfSJunchao Zhang 293d460d7bfSJunchao Zhang // Record for reuse 294d460d7bfSJunchao Zhang fs->csrRowPtr_h = Mi; 295d460d7bfSJunchao Zhang fs->csrVal_h = Ma; 296d460d7bfSJunchao Zhang PetscCall(PetscFree(Mj)); 297d460d7bfSJunchao Zhang } 298d460d7bfSJunchao Zhang // Copy the value 299d460d7bfSJunchao Zhang Mi = fs->csrRowPtr_h; 300d460d7bfSJunchao Zhang Ma = fs->csrVal_h; 301d460d7bfSJunchao Zhang Mnz = Mi[m]; 302d460d7bfSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 303d460d7bfSJunchao Zhang PetscInt llen = Ai[i + 1] - Ai[i]; 304d460d7bfSJunchao Zhang PetscInt ulen = Adiag[i] - Adiag[i + 1]; 305d460d7bfSJunchao Zhang PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of 
L 306d460d7bfSJunchao Zhang Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]]; // recover the diagonal entry 307d460d7bfSJunchao Zhang PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 308d460d7bfSJunchao Zhang } 309d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice)); 310d460d7bfSJunchao Zhang 311*204a0e31SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 312*204a0e31SJunchao Zhang if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed? 313*204a0e31SJunchao Zhang // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer" 314*204a0e31SJunchao Zhang if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 315*204a0e31SJunchao Zhang if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 316*204a0e31SJunchao Zhang } else 317*204a0e31SJunchao Zhang #endif 318*204a0e31SJunchao Zhang { 319d460d7bfSJunchao Zhang // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 320d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 321d460d7bfSJunchao Zhang 322d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 323*204a0e31SJunchao Zhang fs->updatedSpSVAnalysis = PETSC_TRUE; 324d460d7bfSJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 325d460d7bfSJunchao Zhang } 326*204a0e31SJunchao Zhang } 327d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 328d460d7bfSJunchao Zhang } 329d460d7bfSJunchao Zhang #else 330d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 331d71ae5a4SJacob Faibussowitsch { 3329ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3339ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3349ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 335aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 3369ae82921SPaul Mullowney const PetscInt *ai = a->i, *aj = a->j, *vi; 3379ae82921SPaul Mullowney const MatScalar *aa = a->a, *v; 3389ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3399ae82921SPaul Mullowney PetscInt i, nz, nzLower, offset, rowOffset; 3409ae82921SPaul Mullowney 3419ae82921SPaul Mullowney PetscFunctionBegin; 3423ba16761SJacob Faibussowitsch if (!n) PetscFunctionReturn(PETSC_SUCCESS); 343c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3449ae82921SPaul Mullowney try { 3459ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
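         With this layout nzLower = n + ai[n] - ai[1]: one unit diagonal entry per row plus the strictly
         lower entries of rows 1..n-1 filled in by the loop below (row 0 contributes only its diagonal).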
*/ 3469ae82921SPaul Mullowney nzLower = n + ai[n] - ai[1]; 347da79fbbcSStefano Zampini if (!loTriFactor) { 3482cbc15d9SMark PetscScalar *AALo; 3492cbc15d9SMark 3509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar))); 3519ae82921SPaul Mullowney 3529ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 3539566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt))); 3549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt))); 3559ae82921SPaul Mullowney 3569ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3579ae82921SPaul Mullowney AiLo[0] = (PetscInt)0; 3589ae82921SPaul Mullowney AiLo[n] = nzLower; 3599ae82921SPaul Mullowney AjLo[0] = (PetscInt)0; 3609ae82921SPaul Mullowney AALo[0] = (MatScalar)1.0; 3619ae82921SPaul Mullowney v = aa; 3629ae82921SPaul Mullowney vi = aj; 3639ae82921SPaul Mullowney offset = 1; 3649ae82921SPaul Mullowney rowOffset = 1; 3659ae82921SPaul Mullowney for (i = 1; i < n; i++) { 3669ae82921SPaul Mullowney nz = ai[i + 1] - ai[i]; 367e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3689ae82921SPaul Mullowney AiLo[i] = rowOffset; 3699ae82921SPaul Mullowney rowOffset += nz + 1; 3709ae82921SPaul Mullowney 371f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AjLo[offset], vi, nz)); 372f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AALo[offset], v, nz)); 3739ae82921SPaul Mullowney 3749ae82921SPaul Mullowney offset += nz; 3759ae82921SPaul Mullowney AjLo[offset] = (PetscInt)i; 3769ae82921SPaul Mullowney AALo[offset] = (MatScalar)1.0; 3779ae82921SPaul Mullowney offset += 1; 3789ae82921SPaul Mullowney 3799ae82921SPaul Mullowney v += nz; 3809ae82921SPaul Mullowney vi += nz; 3819ae82921SPaul Mullowney } 3822205254eSKarl Rupp 383aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 3849566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 385da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 386aa372e3fSPaul Mullowney /* Create the matrix description */ 3879566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 3889566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 3891b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 3909566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 391afb2bd1cSJunchao Zhang #else 3929566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 393afb2bd1cSJunchao Zhang #endif 3949566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 3959566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 396aa372e3fSPaul Mullowney 397aa372e3fSPaul Mullowney /* set the operation */ 398aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 399aa372e3fSPaul Mullowney 400aa372e3fSPaul Mullowney /* set the matrix */ 401aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 402aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 403aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 404aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 405aa372e3fSPaul Mullowney 406aa372e3fSPaul 
Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 407aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1); 408aa372e3fSPaul Mullowney 409aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 410aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower); 411aa372e3fSPaul Mullowney 412aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 413aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo + nzLower); 414aa372e3fSPaul Mullowney 415afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 4169566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 417261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 4181b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4199371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 4209371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 4219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 422afb2bd1cSJunchao Zhang #endif 423afb2bd1cSJunchao Zhang 424aa372e3fSPaul Mullowney /* perform the solve analysis */ 4259371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 4269f7ba44dSJacob Faibussowitsch loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 4279566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 4289566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 429aa372e3fSPaul Mullowney 430da79fbbcSStefano Zampini /* assign the pointer */ 431aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 4322cbc15d9SMark loTriFactor->AA_h = AALo; 4339566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiLo)); 4349566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjLo)); 4359566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar))); 436da79fbbcSStefano Zampini } else { /* update values only */ 43748a46eb9SPierre Jolivet if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar))); 438da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4392cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 440da79fbbcSStefano Zampini v = aa; 441da79fbbcSStefano Zampini vi = aj; 442da79fbbcSStefano Zampini offset = 1; 443da79fbbcSStefano Zampini for (i = 1; i < n; i++) { 444da79fbbcSStefano Zampini nz = ai[i + 1] - ai[i]; 445f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz)); 446da79fbbcSStefano Zampini offset += nz; 4472cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 448da79fbbcSStefano Zampini offset += 1; 449da79fbbcSStefano Zampini v += nz; 450da79fbbcSStefano Zampini } 4512cbc15d9SMark 
loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower); 4529566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar))); 453da79fbbcSStefano Zampini } 454d71ae5a4SJacob Faibussowitsch } catch (char *ex) { 455d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 456d71ae5a4SJacob Faibussowitsch } 4579ae82921SPaul Mullowney } 4583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4599ae82921SPaul Mullowney } 4609ae82921SPaul Mullowney 461d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 462d71ae5a4SJacob Faibussowitsch { 4639ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4649ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4659ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 466aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 4679ae82921SPaul Mullowney const PetscInt *aj = a->j, *adiag = a->diag, *vi; 4689ae82921SPaul Mullowney const MatScalar *aa = a->a, *v; 4699ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 4709ae82921SPaul Mullowney PetscInt i, nz, nzUpper, offset; 4719ae82921SPaul Mullowney 4729ae82921SPaul Mullowney PetscFunctionBegin; 4733ba16761SJacob Faibussowitsch if (!n) PetscFunctionReturn(PETSC_SUCCESS); 474c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 4759ae82921SPaul Mullowney try { 4769ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 4779ae82921SPaul Mullowney nzUpper = adiag[0] - adiag[n]; 478da79fbbcSStefano Zampini if (!upTriFactor) { 4792cbc15d9SMark PetscScalar *AAUp; 4802cbc15d9SMark 4819566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 4822cbc15d9SMark 4839ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 4849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 4859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 4869ae82921SPaul Mullowney 4879ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 4889ae82921SPaul Mullowney AiUp[0] = (PetscInt)0; 4899ae82921SPaul Mullowney AiUp[n] = nzUpper; 4909ae82921SPaul Mullowney offset = nzUpper; 4919ae82921SPaul Mullowney for (i = n - 1; i >= 0; i--) { 4929ae82921SPaul Mullowney v = aa + adiag[i + 1] + 1; 4939ae82921SPaul Mullowney vi = aj + adiag[i + 1] + 1; 4949ae82921SPaul Mullowney 495e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 4969ae82921SPaul Mullowney nz = adiag[i] - adiag[i + 1] - 1; 4979ae82921SPaul Mullowney 498e057df02SPaul Mullowney /* decrement the offset */ 4999ae82921SPaul Mullowney offset -= (nz + 1); 5009ae82921SPaul Mullowney 501e057df02SPaul Mullowney /* first, set the diagonal elements */ 5029ae82921SPaul Mullowney AjUp[offset] = (PetscInt)i; 50309f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1. 
/ v[nz]; 5049ae82921SPaul Mullowney AiUp[i] = AiUp[i + 1] - (nz + 1); 5059ae82921SPaul Mullowney 506f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz)); 507f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz)); 5089ae82921SPaul Mullowney } 5092205254eSKarl Rupp 510aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 5119566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 512da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5132205254eSKarl Rupp 514aa372e3fSPaul Mullowney /* Create the matrix description */ 5159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 5169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 5171b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 5189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 519afb2bd1cSJunchao Zhang #else 5209566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 521afb2bd1cSJunchao Zhang #endif 5229566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 5239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 524aa372e3fSPaul Mullowney 525aa372e3fSPaul Mullowney /* set the operation */ 526aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 527aa372e3fSPaul Mullowney 528aa372e3fSPaul Mullowney /* set the matrix */ 529aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 530aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 531aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 532aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 533aa372e3fSPaul Mullowney 534aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 535aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1); 536aa372e3fSPaul Mullowney 537aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 538aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper); 539aa372e3fSPaul Mullowney 540aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 541aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper); 542aa372e3fSPaul Mullowney 543afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 5449566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 545261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 5461b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 5479371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 5489371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 5499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 550afb2bd1cSJunchao Zhang #endif 551afb2bd1cSJunchao 
Zhang 552aa372e3fSPaul Mullowney /* perform the solve analysis */ 5539371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 5549f7ba44dSJacob Faibussowitsch upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 5559f7ba44dSJacob Faibussowitsch 5569566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 5579566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 558aa372e3fSPaul Mullowney 559da79fbbcSStefano Zampini /* assign the pointer */ 560aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 5612cbc15d9SMark upTriFactor->AA_h = AAUp; 5629566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 5639566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 5649566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar))); 565da79fbbcSStefano Zampini } else { 56648a46eb9SPierre Jolivet if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar))); 567da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 568da79fbbcSStefano Zampini offset = nzUpper; 569da79fbbcSStefano Zampini for (i = n - 1; i >= 0; i--) { 570da79fbbcSStefano Zampini v = aa + adiag[i + 1] + 1; 571da79fbbcSStefano Zampini 572da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 573da79fbbcSStefano Zampini nz = adiag[i] - adiag[i + 1] - 1; 574da79fbbcSStefano Zampini 575da79fbbcSStefano Zampini /* decrement the offset */ 576da79fbbcSStefano Zampini offset -= (nz + 1); 577da79fbbcSStefano Zampini 578da79fbbcSStefano Zampini /* first, set the diagonal elements */ 5792cbc15d9SMark upTriFactor->AA_h[offset] = 1. 
/ v[nz]; 580f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz)); 581da79fbbcSStefano Zampini } 5822cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper); 5839566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar))); 584da79fbbcSStefano Zampini } 585d71ae5a4SJacob Faibussowitsch } catch (char *ex) { 586d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 587d71ae5a4SJacob Faibussowitsch } 5889ae82921SPaul Mullowney } 5893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5909ae82921SPaul Mullowney } 591d460d7bfSJunchao Zhang #endif 5929ae82921SPaul Mullowney 593d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 594d71ae5a4SJacob Faibussowitsch { 5959ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 5969ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 597c9e33d71SJunchao Zhang IS isrow = a->row, isicol = a->icol; 5989ae82921SPaul Mullowney PetscBool row_identity, col_identity; 5999ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6009ae82921SPaul Mullowney 6019ae82921SPaul Mullowney PetscFunctionBegin; 60228b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 603b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 604d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A)); 605d460d7bfSJunchao Zhang #else 6069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 6079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 608ad540459SPierre Jolivet if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 609d460d7bfSJunchao Zhang #endif 610d460d7bfSJunchao Zhang 611aa372e3fSPaul Mullowney cusparseTriFactors->nnz = a->nz; 6129ae82921SPaul Mullowney 613d460d7bfSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU 614e057df02SPaul Mullowney /* lower triangular indices */ 6159566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow, &row_identity)); 616da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 617da79fbbcSStefano Zampini const PetscInt *r; 618da79fbbcSStefano Zampini 6199566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 620aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 621aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r + n); 6229566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 6239566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 624da79fbbcSStefano Zampini } 6259ae82921SPaul Mullowney 626e057df02SPaul Mullowney /* upper triangular indices */ 627c9e33d71SJunchao Zhang PetscCall(ISIdentity(isicol, &col_identity)); 628da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 629da79fbbcSStefano Zampini const PetscInt *c; 630da79fbbcSStefano Zampini 631c9e33d71SJunchao Zhang PetscCall(ISGetIndices(isicol, &c)); 632aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 633aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c + n); 634c9e33d71SJunchao Zhang PetscCall(ISRestoreIndices(isicol, &c)); 6359566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * 
sizeof(PetscInt))); 636da79fbbcSStefano Zampini } 6373ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6389ae82921SPaul Mullowney } 6399ae82921SPaul Mullowney 640b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 641d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A) 642d460d7bfSJunchao Zhang { 643d460d7bfSJunchao Zhang Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data); 644d460d7bfSJunchao Zhang PetscInt m = A->rmap->n; 645d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 646d460d7bfSJunchao Zhang const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag; 647d460d7bfSJunchao Zhang const MatScalar *Aa = a->a; 648d460d7bfSJunchao Zhang PetscInt *Mj, Mnz; 649d460d7bfSJunchao Zhang PetscScalar *Ma, *D; 650d460d7bfSJunchao Zhang 651d460d7bfSJunchao Zhang PetscFunctionBegin; 652d460d7bfSJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU 653d460d7bfSJunchao Zhang if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0 654d460d7bfSJunchao Zhang // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host. 655d460d7bfSJunchao Zhang // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host. 656d460d7bfSJunchao Zhang Mnz = Ai[m]; // Unz (with the unit diagonal) 657d460d7bfSJunchao Zhang PetscCall(PetscMalloc1(Mnz, &Ma)); 658d460d7bfSJunchao Zhang PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp 659d460d7bfSJunchao Zhang PetscCall(PetscMalloc1(m, &D)); // the diagonal 660d460d7bfSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 661d460d7bfSJunchao Zhang PetscInt ulen = Ai[i + 1] - Ai[i]; 662d460d7bfSJunchao Zhang Mj[Ai[i]] = i; // diagonal entry 663d460d7bfSJunchao Zhang PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal 664d460d7bfSJunchao Zhang } 665d460d7bfSJunchao Zhang // Copy M (U) from host to device 666f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1))); 667f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz)); 668f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz)); 669f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m)); 670d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice)); 671d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice)); 672d460d7bfSJunchao Zhang 673d460d7bfSJunchao Zhang // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 674d460d7bfSJunchao Zhang // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 675d460d7bfSJunchao Zhang // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 676d460d7bfSJunchao Zhang // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 677d460d7bfSJunchao Zhang // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
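      // For ICC/Cholesky the factorization is applied as Ut D U x = b (see MatSolve_SeqAIJCUSPARSE_Cholesky
      // below): U is stored here with a cosmetic unit diagonal so cuSPARSE never reads it, while the true
      // diagonal (already inverted by MatCholeskyFactorNumeric_SeqAIJ) is kept separately in fs->diag and
      // applied as an element-wise multiply between the two triangular solves.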
678d460d7bfSJunchao Zhang cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER; 679d460d7bfSJunchao Zhang cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal 680d460d7bfSJunchao Zhang const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 681d460d7bfSJunchao Zhang 682d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 683d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 684d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 685d460d7bfSJunchao Zhang 686d460d7bfSJunchao Zhang // Allocate work vectors in SpSv 687f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m)); 688f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m)); 689d460d7bfSJunchao Zhang 690d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 691d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 692d460d7bfSJunchao Zhang 693d460d7bfSJunchao Zhang // Query buffer sizes for SpSV and then allocate buffers 694d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 695d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 696d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 697d460d7bfSJunchao Zhang 698aaa8cc7dSPierre Jolivet PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer 699d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 700d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 701d460d7bfSJunchao Zhang 702d460d7bfSJunchao Zhang // Record for reuse 703d460d7bfSJunchao Zhang fs->csrVal_h = Ma; 704d460d7bfSJunchao Zhang fs->diag_h = D; 705d460d7bfSJunchao Zhang PetscCall(PetscFree(Mj)); 706d460d7bfSJunchao Zhang } 707d460d7bfSJunchao Zhang // Copy the value 708d460d7bfSJunchao Zhang Ma = fs->csrVal_h; 709d460d7bfSJunchao Zhang D = fs->diag_h; 710d460d7bfSJunchao Zhang Mnz = Ai[m]; 711d460d7bfSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 712d460d7bfSJunchao Zhang D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 713d460d7bfSJunchao Zhang Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT 714d460d7bfSJunchao Zhang for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; 715d460d7bfSJunchao Zhang } 716d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice)); 717d460d7bfSJunchao Zhang 
PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice)); 718d460d7bfSJunchao Zhang 719*204a0e31SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 720*204a0e31SJunchao Zhang if (fs->updatedSpSVAnalysis) { 721*204a0e31SJunchao Zhang if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 722*204a0e31SJunchao Zhang if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 723*204a0e31SJunchao Zhang } else 724*204a0e31SJunchao Zhang #endif 725*204a0e31SJunchao Zhang { 726d460d7bfSJunchao Zhang // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 727d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 728d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 729*204a0e31SJunchao Zhang fs->updatedSpSVAnalysis = PETSC_TRUE; 730*204a0e31SJunchao Zhang } 731d460d7bfSJunchao Zhang } 732d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 733d460d7bfSJunchao Zhang } 734d460d7bfSJunchao Zhang 735d460d7bfSJunchao Zhang // Solve Ut D U x = b 736d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x) 737d460d7bfSJunchao Zhang { 738d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 739d460d7bfSJunchao Zhang Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 740d460d7bfSJunchao Zhang const PetscScalar *barray; 741d460d7bfSJunchao Zhang PetscScalar *xarray; 742d460d7bfSJunchao Zhang thrust::device_ptr<const PetscScalar> bGPU; 743d460d7bfSJunchao Zhang thrust::device_ptr<PetscScalar> xGPU; 744d460d7bfSJunchao Zhang const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 745d460d7bfSJunchao Zhang PetscInt m = A->rmap->n; 746d460d7bfSJunchao Zhang 747d460d7bfSJunchao Zhang PetscFunctionBegin; 748d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 749d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 750d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 751d460d7bfSJunchao Zhang xGPU = thrust::device_pointer_cast(xarray); 752d460d7bfSJunchao Zhang bGPU = thrust::device_pointer_cast(barray); 753d460d7bfSJunchao Zhang 754d460d7bfSJunchao Zhang // Reorder b with the row permutation if needed, and wrap the result in fs->X 755d460d7bfSJunchao Zhang if (fs->rpermIndices) { 756d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 757d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 758d460d7bfSJunchao Zhang } else { 759d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 760d460d7bfSJunchao Zhang } 761d460d7bfSJunchao Zhang 762d460d7bfSJunchao Zhang // Solve Ut Y = X 763d460d7bfSJunchao Zhang 
PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 764d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 765d460d7bfSJunchao Zhang 766d460d7bfSJunchao Zhang // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). 767d460d7bfSJunchao Zhang // It is basically a vector element-wise multiplication, but cublas does not have it! 768d460d7bfSJunchao Zhang PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>())); 769d460d7bfSJunchao Zhang 770d460d7bfSJunchao Zhang // Solve U X = Y 771d460d7bfSJunchao Zhang if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 772d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 773d460d7bfSJunchao Zhang } else { 774d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 775d460d7bfSJunchao Zhang } 776d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 777d460d7bfSJunchao Zhang 778d460d7bfSJunchao Zhang // Reorder X with the column permutation if needed, and put the result back to x 779d460d7bfSJunchao Zhang if (fs->cpermIndices) { 780d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 781d460d7bfSJunchao Zhang thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 782d460d7bfSJunchao Zhang } 783d460d7bfSJunchao Zhang 784d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 785d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 786d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 787d460d7bfSJunchao Zhang PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); 788d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 789d460d7bfSJunchao Zhang } 790d460d7bfSJunchao Zhang #else 791d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 792d71ae5a4SJacob Faibussowitsch { 793087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 794087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 795aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 796aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 797087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 798087f3262SPaul Mullowney PetscScalar *AAUp; 799087f3262SPaul Mullowney PetscScalar *AALo; 800087f3262SPaul Mullowney PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j; 801087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 802087f3262SPaul Mullowney const PetscInt *ai = b->i, *aj = b->j, *vj; 803087f3262SPaul Mullowney const MatScalar *aa = 
b->a, *v; 804087f3262SPaul Mullowney 805087f3262SPaul Mullowney PetscFunctionBegin; 8063ba16761SJacob Faibussowitsch if (!n) PetscFunctionReturn(PETSC_SUCCESS); 807c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 808087f3262SPaul Mullowney try { 8099566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 8109566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 811da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 812087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 8139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 8149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 815087f3262SPaul Mullowney 816087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 817087f3262SPaul Mullowney AiUp[0] = (PetscInt)0; 818087f3262SPaul Mullowney AiUp[n] = nzUpper; 819087f3262SPaul Mullowney offset = 0; 820087f3262SPaul Mullowney for (i = 0; i < n; i++) { 821087f3262SPaul Mullowney /* set the pointers */ 822087f3262SPaul Mullowney v = aa + ai[i]; 823087f3262SPaul Mullowney vj = aj + ai[i]; 824087f3262SPaul Mullowney nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 825087f3262SPaul Mullowney 826087f3262SPaul Mullowney /* first, set the diagonal elements */ 827087f3262SPaul Mullowney AjUp[offset] = (PetscInt)i; 82809f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0 / v[nz]; 829087f3262SPaul Mullowney AiUp[i] = offset; 83009f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0 / v[nz]; 831087f3262SPaul Mullowney 832087f3262SPaul Mullowney offset += 1; 833087f3262SPaul Mullowney if (nz > 0) { 834f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AjUp[offset], vj, nz)); 835f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 836087f3262SPaul Mullowney for (j = offset; j < offset + nz; j++) { 837087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 838087f3262SPaul Mullowney AALo[j] = AAUp[j] / v[nz]; 839087f3262SPaul Mullowney } 840087f3262SPaul Mullowney offset += nz; 841087f3262SPaul Mullowney } 842087f3262SPaul Mullowney } 843087f3262SPaul Mullowney 844aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8459566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 846da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 847087f3262SPaul Mullowney 848aa372e3fSPaul Mullowney /* Create the matrix description */ 8499566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 8509566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8511b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8529566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 853afb2bd1cSJunchao Zhang #else 8549566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 855afb2bd1cSJunchao Zhang #endif 8569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8579566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 858087f3262SPaul Mullowney 859aa372e3fSPaul Mullowney /* set the 
matrix */ 860aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 861aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 862aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 863aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 864aa372e3fSPaul Mullowney 865aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 866aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 867aa372e3fSPaul Mullowney 868aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 869aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 870aa372e3fSPaul Mullowney 871aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 872aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 873aa372e3fSPaul Mullowney 874afb2bd1cSJunchao Zhang /* set the operation */ 875afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 876afb2bd1cSJunchao Zhang 877afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 8789566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 879261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 8801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8819371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 8829371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 8839566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 884afb2bd1cSJunchao Zhang #endif 885afb2bd1cSJunchao Zhang 886aa372e3fSPaul Mullowney /* perform the solve analysis */ 8879371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 8889f7ba44dSJacob Faibussowitsch upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 8899f7ba44dSJacob Faibussowitsch 8909566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 8919566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 892aa372e3fSPaul Mullowney 893da79fbbcSStefano Zampini /* assign the pointer */ 894aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 895aa372e3fSPaul Mullowney 896aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8979566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 898da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 899aa372e3fSPaul Mullowney 900aa372e3fSPaul Mullowney /* Create the matrix description */ 9019566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 9029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 9031b0a6780SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 9049566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 905afb2bd1cSJunchao Zhang #else 9069566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 907afb2bd1cSJunchao Zhang #endif 9089566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 9099566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 910aa372e3fSPaul Mullowney 911aa372e3fSPaul Mullowney /* set the operation */ 912aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 913aa372e3fSPaul Mullowney 914aa372e3fSPaul Mullowney /* set the matrix */ 915aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 916aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 917aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 918aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 919aa372e3fSPaul Mullowney 920aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 921aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 922aa372e3fSPaul Mullowney 923aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 924aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 925aa372e3fSPaul Mullowney 926aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 927aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 928aa372e3fSPaul Mullowney 929afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 9309566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 931261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 9321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 9339371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 9349371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 9359566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 936afb2bd1cSJunchao Zhang #endif 937afb2bd1cSJunchao Zhang 938aa372e3fSPaul Mullowney /* perform the solve analysis */ 9399371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 9409f7ba44dSJacob Faibussowitsch loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 9419f7ba44dSJacob Faibussowitsch 9429566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9439566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 944aa372e3fSPaul Mullowney 945da79fbbcSStefano Zampini /* assign the pointer */ 946aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors 
*)A->spptr)->loTriFactorPtr = loTriFactor; 947087f3262SPaul Mullowney 9489566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 9499566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 9509566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 951da79fbbcSStefano Zampini } else { 952da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 953da79fbbcSStefano Zampini offset = 0; 954da79fbbcSStefano Zampini for (i = 0; i < n; i++) { 955da79fbbcSStefano Zampini /* set the pointers */ 956da79fbbcSStefano Zampini v = aa + ai[i]; 957da79fbbcSStefano Zampini nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 958da79fbbcSStefano Zampini 959da79fbbcSStefano Zampini /* first, set the diagonal elements */ 960da79fbbcSStefano Zampini AAUp[offset] = 1.0 / v[nz]; 961da79fbbcSStefano Zampini AALo[offset] = 1.0 / v[nz]; 962da79fbbcSStefano Zampini 963da79fbbcSStefano Zampini offset += 1; 964da79fbbcSStefano Zampini if (nz > 0) { 965f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 966da79fbbcSStefano Zampini for (j = offset; j < offset + nz; j++) { 967da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 968da79fbbcSStefano Zampini AALo[j] = AAUp[j] / v[nz]; 969da79fbbcSStefano Zampini } 970da79fbbcSStefano Zampini offset += nz; 971da79fbbcSStefano Zampini } 972da79fbbcSStefano Zampini } 97328b400f6SJacob Faibussowitsch PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 97428b400f6SJacob Faibussowitsch PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 975da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 976da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 9779566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 978da79fbbcSStefano Zampini } 9799566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AAUp)); 9809566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AALo)); 981d71ae5a4SJacob Faibussowitsch } catch (char *ex) { 982d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 983d71ae5a4SJacob Faibussowitsch } 984087f3262SPaul Mullowney } 9853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 986087f3262SPaul Mullowney } 987d460d7bfSJunchao Zhang #endif 988087f3262SPaul Mullowney 989d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 990d71ae5a4SJacob Faibussowitsch { 991087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 992087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 993087f3262SPaul Mullowney IS ip = a->row; 994087f3262SPaul Mullowney PetscBool perm_identity; 995087f3262SPaul Mullowney PetscInt n = A->rmap->n; 996087f3262SPaul Mullowney 997087f3262SPaul Mullowney PetscFunctionBegin; 99828b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 999d460d7bfSJunchao Zhang 1000b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1001d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A)); 1002d460d7bfSJunchao Zhang #else 10039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 1004ad540459SPierre Jolivet if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new 
THRUSTARRAY(n); 1005d460d7bfSJunchao Zhang #endif 1006aa372e3fSPaul Mullowney cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 1007aa372e3fSPaul Mullowney 1008da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 1009da79fbbcSStefano Zampini 1010087f3262SPaul Mullowney /* lower triangular indices */ 10119566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip, &perm_identity)); 1012087f3262SPaul Mullowney if (!perm_identity) { 10134e4bbfaaSStefano Zampini IS iip; 1014da79fbbcSStefano Zampini const PetscInt *irip, *rip; 10154e4bbfaaSStefano Zampini 10169566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 10179566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iip, &irip)); 10189566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ip, &rip)); 1019aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 1020aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip + n); 1021aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 10224e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip + n); 10239566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iip, &irip)); 10249566063dSJacob Faibussowitsch PetscCall(ISDestroy(&iip)); 10259566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ip, &rip)); 10269566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 1027da79fbbcSStefano Zampini } 10283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1029087f3262SPaul Mullowney } 1030087f3262SPaul Mullowney 1031d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1032d71ae5a4SJacob Faibussowitsch { 1033087f3262SPaul Mullowney PetscFunctionBegin; 10349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 10359566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1036ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 1037d460d7bfSJunchao Zhang 1038b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1039d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1040d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1041d460d7bfSJunchao Zhang #else 1042087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 1043d460d7bfSJunchao Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1044d460d7bfSJunchao Zhang IS ip = b->row; 1045d460d7bfSJunchao Zhang PetscBool perm_identity; 1046d460d7bfSJunchao Zhang 10479566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip, &perm_identity)); 1048087f3262SPaul Mullowney if (perm_identity) { 1049087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1050087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1051087f3262SPaul Mullowney } else { 1052087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1053087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1054d460d7bfSJunchao Zhang } 1055d460d7bfSJunchao Zhang #endif 10564e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 10574e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 1058087f3262SPaul Mullowney 1059087f3262SPaul Mullowney /* get the triangular factors */ 10609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 10613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1062087f3262SPaul Mullowney } 10639ae82921SPaul Mullowney 1064b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1065d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1066d71ae5a4SJacob Faibussowitsch { 1067bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1068aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1069aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1070da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1071da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1072aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1073aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1074aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 1075aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 1076b175d8bbSPaul Mullowney 1077bda325fcSPaul Mullowney PetscFunctionBegin; 1078aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 10799566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 1080da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1081aa372e3fSPaul Mullowney 1082aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1083aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1084aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 10859371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1086aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1087aa372e3fSPaul Mullowney 1088aa372e3fSPaul Mullowney /* Create the matrix description */ 10899566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 10909566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 10919566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 10929566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 10939566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1094aa372e3fSPaul Mullowney 1095aa372e3fSPaul Mullowney /* set the operation */ 1096aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1097aa372e3fSPaul Mullowney 1098aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1099aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1100afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1101afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1102aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1103afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1104afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1105afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1106aa372e3fSPaul Mullowney 1107aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1108afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11099371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 11109371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 11119371c9d4SSatish Balay loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 11129566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1113afb2bd1cSJunchao Zhang #endif 1114afb2bd1cSJunchao Zhang 11159566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 11169f7ba44dSJacob Faibussowitsch { 11179f7ba44dSJacob Faibussowitsch // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
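      // Converting the lower factor from CSR to CSC here is what materializes its transpose: the
      // CSC arrays of a matrix are exactly the CSR arrays of its transpose, which is why
      // loTriFactorT->solveOp was set to CUSPARSE_OPERATION_NON_TRANSPOSE above. The #if/#else in
      // the argument list only switches between the CUDA 11 (Csr2cscEx2-style) calling convention,
      // with an explicit algorithm and user-allocated buffer, and the legacy csr2csc signature.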
11189f7ba44dSJacob Faibussowitsch auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 11199371c9d4SSatish Balay loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1120afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11219f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1122afb2bd1cSJunchao Zhang #else 11239f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1124afb2bd1cSJunchao Zhang #endif 11259f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(stat); 11269f7ba44dSJacob Faibussowitsch } 11279f7ba44dSJacob Faibussowitsch 11289566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11299566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1130aa372e3fSPaul Mullowney 1131afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 11329566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1133261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 11341b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 11359371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 11369371c9d4SSatish Balay loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 11379566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1138afb2bd1cSJunchao Zhang #endif 1139afb2bd1cSJunchao Zhang 1140afb2bd1cSJunchao Zhang /* perform the solve analysis */ 11419371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 11429f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 11439f7ba44dSJacob Faibussowitsch 11449566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11459566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1146aa372e3fSPaul Mullowney 1147da79fbbcSStefano Zampini /* assign the pointer */ 1148aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1149aa372e3fSPaul Mullowney 1150aa372e3fSPaul Mullowney /*********************************************/ 1151aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1152aa372e3fSPaul Mullowney /*********************************************/ 1153aa372e3fSPaul Mullowney 1154aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 
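  /* The steps below mirror the lower-factor block above: copy the descriptor settings with the
     fill mode flipped, allocate CSC storage of the same size, run csr2csc to form the explicit
     transpose of the upper factor, then create and analyze its csrsv solve information. */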
11559566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactorT)); 1156da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1157aa372e3fSPaul Mullowney 1158aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1159aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1160aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 11619371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1162aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1163aa372e3fSPaul Mullowney 1164aa372e3fSPaul Mullowney /* Create the matrix description */ 11659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 11669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 11679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 11689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 11699566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1170aa372e3fSPaul Mullowney 1171aa372e3fSPaul Mullowney /* set the operation */ 1172aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1173aa372e3fSPaul Mullowney 1174aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1175aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1176afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1177afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1178aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1179afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1180afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1181afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1182aa372e3fSPaul Mullowney 1183aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1184afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11859371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 11869371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 11879371c9d4SSatish Balay upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 11889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1189afb2bd1cSJunchao Zhang #endif 1190afb2bd1cSJunchao Zhang 11919566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 11929f7ba44dSJacob Faibussowitsch { 11939f7ba44dSJacob Faibussowitsch // there is no clean way to have PetscCallCUSPARSE wrapping this function... 11949f7ba44dSJacob Faibussowitsch auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 11959371c9d4SSatish Balay upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1196afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11979f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1198afb2bd1cSJunchao Zhang #else 11999f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1200afb2bd1cSJunchao Zhang #endif 12019f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(stat); 12029f7ba44dSJacob Faibussowitsch } 1203d49cd2b7SBarry Smith 12049566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 12059566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1206aa372e3fSPaul Mullowney 1207afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 12089566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1209261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 12101b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 12119371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 12129371c9d4SSatish Balay upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 12139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1214afb2bd1cSJunchao Zhang #endif 1215afb2bd1cSJunchao Zhang 1216afb2bd1cSJunchao Zhang /* perform the solve analysis */ 12175f80ce2aSJacob Faibussowitsch /* christ, would it have killed you to put this stuff in a 
function????????? */ 12189371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 12199f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1220d49cd2b7SBarry Smith 12219566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 12229566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1223aa372e3fSPaul Mullowney 1224da79fbbcSStefano Zampini /* assign the pointer */ 1225aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 12263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1227bda325fcSPaul Mullowney } 1228d460d7bfSJunchao Zhang #endif 1229bda325fcSPaul Mullowney 12309371c9d4SSatish Balay struct PetscScalarToPetscInt { 12319371c9d4SSatish Balay __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1232a49f1ed0SStefano Zampini }; 1233a49f1ed0SStefano Zampini 1234d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1235d71ae5a4SJacob Faibussowitsch { 1236aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1237a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 1238bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1239bda325fcSPaul Mullowney cusparseStatus_t stat; 1240aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1241b175d8bbSPaul Mullowney 1242bda325fcSPaul Mullowney PetscFunctionBegin; 12439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1244a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 124528b400f6SJacob Faibussowitsch PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1246a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 124708401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 12483ba16761SJacob Faibussowitsch if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 12499566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 12509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 125148a46eb9SPierre Jolivet if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1252a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1253aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 12549566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1255aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 12569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 12579566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1258aa372e3fSPaul Mullowney 1259b06137fdSPaul Mullowney /* set alpha and beta */ 1260f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar))); 
1261f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar))); 1262f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar))); 12639566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 12649566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 12659566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1266b06137fdSPaul Mullowney 1267aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1268aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1269a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1270554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1271554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1272aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1273a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1274aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1275aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1276a3fdcf43SKarl Rupp 1277ad540459SPierre Jolivet if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 127881902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1279afb2bd1cSJunchao Zhang 1280afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 12813606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 12829371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 12839371c9d4SSatish Balay indexBase, cusparse_scalartype); 12849371c9d4SSatish Balay PetscCallCUSPARSE(stat); 12853606e59fSJunchao Zhang #else 12863606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 12873606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 12883606e59fSJunchao Zhang 12893606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 12903606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 12913606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 
12923606e59fSJunchao Zhang */ 12933606e59fSJunchao Zhang if (matrixT->num_entries) { 12949371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 12959371c9d4SSatish Balay PetscCallCUSPARSE(stat); 12963606e59fSJunchao Zhang 12973606e59fSJunchao Zhang } else { 12983606e59fSJunchao Zhang matstructT->matDescr = NULL; 12993606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 13003606e59fSJunchao Zhang } 13013606e59fSJunchao Zhang #endif 1302afb2bd1cSJunchao Zhang #endif 1303aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1304afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1305afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1306afb2bd1cSJunchao Zhang #else 1307aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 130851c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 130951c6d536SStefano Zampini /* First convert HYB to CSR */ 1310aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1311aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1312aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1313aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1314aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1315aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1316aa372e3fSPaul Mullowney 13179371c9d4SSatish Balay stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 13189371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1319aa372e3fSPaul Mullowney 1320aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1321aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1322aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1323aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1324aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1325aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1326aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1327aa372e3fSPaul Mullowney 13289371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 13299371c9d4SSatish Balay tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 13309371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1331aa372e3fSPaul Mullowney 1332aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1333aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 13349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 13359371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 13369371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 13379371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1338aa372e3fSPaul Mullowney 1339aa372e3fSPaul Mullowney /* assign the pointer */ 1340aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13411a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1342aa372e3fSPaul Mullowney /* delete temporaries */ 1343aa372e3fSPaul Mullowney if (tempT) { 1344aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1345aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1346aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1347aa372e3fSPaul Mullowney delete (CsrMatrix *)tempT; 1348087f3262SPaul Mullowney } 1349aa372e3fSPaul Mullowney if (temp) { 1350aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY *)temp->values; 1351aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1352aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1353aa372e3fSPaul Mullowney delete (CsrMatrix *)temp; 1354aa372e3fSPaul Mullowney } 1355afb2bd1cSJunchao Zhang #endif 1356aa372e3fSPaul Mullowney } 1357a49f1ed0SStefano Zampini } 1358a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1359a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1360a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 136128b400f6SJacob Faibussowitsch PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 136228b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 136328b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 136428b400f6SJacob Faibussowitsch PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 136528b400f6SJacob Faibussowitsch PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 136628b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 136728b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 136828b400f6SJacob Faibussowitsch PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1369a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1370a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1371a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 13729566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1373a49f1ed0SStefano Zampini } 1374a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1375a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1376792fecdfSBarry Smith PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1377a49f1ed0SStefano Zampini 
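      /* The identity sequence 0,1,2,... written into csr2csc_a above (stored as PetscScalar) is now
         pushed through the same CSR-to-CSC conversion that defines the transpose layout. The converted
         "values" then record, for each entry of the transpose, the position in the original CSR value
         array it came from, and PetscScalarToPetscInt turns that into the integer map csr2csc_i.
         Once the map exists, refreshing the transpose values is a single gather, roughly (an
         illustrative sketch only, with invented names, not code executed here):

             for (k = 0; k < nnz; k++) matrixT_values[k] = matrix_values[csr2csc_i[k]];

         which is what the thrust::copy over a permutation iterator further below performs on the GPU. */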
1378a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1379a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1380a49f1ed0SStefano Zampini void *csr2cscBuffer; 1381a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 13829371c9d4SSatish Balay stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 13839371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize); 13849371c9d4SSatish Balay PetscCallCUSPARSE(stat); 13859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1386a49f1ed0SStefano Zampini #endif 1387a49f1ed0SStefano Zampini 13881a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13891a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13901a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13911a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13921a2c6b5cSJunchao Zhang 13931a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13941a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 13951a2c6b5cSJunchao Zhang */ 13969371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1397a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 13989371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 13999371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1400a49f1ed0SStefano Zampini #else 14019371c9d4SSatish Balay matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 14029371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1403a49f1ed0SStefano Zampini #endif 14041a2c6b5cSJunchao Zhang } else { 14051a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 14061a2c6b5cSJunchao Zhang } 14071a2c6b5cSJunchao Zhang 1408a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1409792fecdfSBarry Smith PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1410a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 14119566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1412a49f1ed0SStefano Zampini #endif 1413a49f1ed0SStefano Zampini } 14149371c9d4SSatish Balay PetscCallThrust( 14159371c9d4SSatish Balay thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1416a49f1ed0SStefano Zampini } 14179566063dSJacob Faibussowitsch 
PetscCall(PetscLogGpuTimeEnd()); 14189566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1419213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1420213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1421aa372e3fSPaul Mullowney /* assign the pointer */ 1422aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 14231a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 14243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1425bda325fcSPaul Mullowney } 1426bda325fcSPaul Mullowney 1427b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1428d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1429d460d7bfSJunchao Zhang { 1430d460d7bfSJunchao Zhang const PetscScalar *barray; 1431d460d7bfSJunchao Zhang PetscScalar *xarray; 1432d460d7bfSJunchao Zhang thrust::device_ptr<const PetscScalar> bGPU; 1433d460d7bfSJunchao Zhang thrust::device_ptr<PetscScalar> xGPU; 1434d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1435d460d7bfSJunchao Zhang const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1436d460d7bfSJunchao Zhang const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; 1437d460d7bfSJunchao Zhang const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1438d460d7bfSJunchao Zhang PetscInt m = A->rmap->n; 1439d460d7bfSJunchao Zhang 1440d460d7bfSJunchao Zhang PetscFunctionBegin; 1441d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1442d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1443d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1444d460d7bfSJunchao Zhang xGPU = thrust::device_pointer_cast(xarray); 1445d460d7bfSJunchao Zhang bGPU = thrust::device_pointer_cast(barray); 1446d460d7bfSJunchao Zhang 1447d460d7bfSJunchao Zhang // Reorder b with the row permutation if needed, and wrap the result in fs->X 1448d460d7bfSJunchao Zhang if (fs->rpermIndices) { 1449d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1450d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1451d460d7bfSJunchao Zhang } else { 1452d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1453d460d7bfSJunchao Zhang } 1454d460d7bfSJunchao Zhang 1455d460d7bfSJunchao Zhang // Solve L Y = X 1456d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1457d460d7bfSJunchao Zhang // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()! 
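  // In other words, the spsvBuffer_* allocations handed to cusparseSpSV_analysis() have to remain
  // valid for every later cusparseSpSV_solve() that uses the matching spsvDescr_*, which is why they
  // are kept in the Mat_SeqAIJCUSPARSETriFactors struct instead of being freed right after analysis.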
1458d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L)); 1459d460d7bfSJunchao Zhang 1460d460d7bfSJunchao Zhang // Solve U X = Y 1461d460d7bfSJunchao Zhang if (fs->cpermIndices) { 1462d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1463d460d7bfSJunchao Zhang } else { 1464d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1465d460d7bfSJunchao Zhang } 1466d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 1467d460d7bfSJunchao Zhang 1468d460d7bfSJunchao Zhang // Reorder X with the column permutation if needed, and put the result back to x 1469d460d7bfSJunchao Zhang if (fs->cpermIndices) { 1470d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1471d460d7bfSJunchao Zhang thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1472d460d7bfSJunchao Zhang } 1473d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1474d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1475d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1476d460d7bfSJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); 1477d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1478d460d7bfSJunchao Zhang } 1479d460d7bfSJunchao Zhang 1480d460d7bfSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1481d460d7bfSJunchao Zhang { 1482d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1483d460d7bfSJunchao Zhang Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1484d460d7bfSJunchao Zhang const PetscScalar *barray; 1485d460d7bfSJunchao Zhang PetscScalar *xarray; 1486d460d7bfSJunchao Zhang thrust::device_ptr<const PetscScalar> bGPU; 1487d460d7bfSJunchao Zhang thrust::device_ptr<PetscScalar> xGPU; 1488d460d7bfSJunchao Zhang const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE; 1489d460d7bfSJunchao Zhang const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1490d460d7bfSJunchao Zhang PetscInt m = A->rmap->n; 1491d460d7bfSJunchao Zhang 1492d460d7bfSJunchao Zhang PetscFunctionBegin; 1493d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1494d460d7bfSJunchao Zhang if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time 1495d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1496d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. 
We only do transpose solve with it */ 1497d460d7bfSJunchao Zhang fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1498d460d7bfSJunchao Zhang 1499d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1500d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1501d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1502d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1503d460d7bfSJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_TRUE; 1504d460d7bfSJunchao Zhang } 1505d460d7bfSJunchao Zhang 1506d460d7bfSJunchao Zhang if (!fs->updatedTransposeSpSVAnalysis) { 1507d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1508d460d7bfSJunchao Zhang 1509d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1510d460d7bfSJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1511d460d7bfSJunchao Zhang } 1512d460d7bfSJunchao Zhang 1513d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1514d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1515d460d7bfSJunchao Zhang xGPU = thrust::device_pointer_cast(xarray); 1516d460d7bfSJunchao Zhang bGPU = thrust::device_pointer_cast(barray); 1517d460d7bfSJunchao Zhang 1518d460d7bfSJunchao Zhang // Reorder b with the row permutation if needed, and wrap the result in fs->X 1519d460d7bfSJunchao Zhang if (fs->rpermIndices) { 1520d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1521d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1522d460d7bfSJunchao Zhang } else { 1523d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1524d460d7bfSJunchao Zhang } 1525d460d7bfSJunchao Zhang 1526d460d7bfSJunchao Zhang // Solve Ut Y = X 1527d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1528d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 1529d460d7bfSJunchao Zhang 1530d460d7bfSJunchao Zhang // Solve Lt X = Y 1531d460d7bfSJunchao Zhang if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 1532d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1533d460d7bfSJunchao Zhang } else { 1534d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1535d460d7bfSJunchao Zhang } 1536d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt)); 
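/* A short note on the ordering above (a sketch, no additional functionality): with the factorization stored as M = L * U in spMatDescr_L/U, the transposed system M^T x = b expands to U^T (L^T x) = b, so the transpose solve first solves U^T y = b and then L^T x = y. The same spMatDescr_L/U are reused with opA = CUSPARSE_OPERATION_TRANSPOSE; only the SpSV descriptors (spsvDescr_Ut/Lt, created lazily on the first transpose solve above) and their analysis data are separate, since the analysis depends on the operation:

     // solve U^T y = b : cusparseSpSV_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, ..., spMatDescr_U, dnVecDescr_X, dnVecDescr_Y, ..., spsvDescr_Ut)
     // solve L^T x = y : cusparseSpSV_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, ..., spMatDescr_L, dnVecDescr_Y, dnVecDescr_X, ..., spsvDescr_Lt)
*/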
1537d460d7bfSJunchao Zhang 1538d460d7bfSJunchao Zhang // Reorder X with the column permutation if needed, and put the result back to x 1539d460d7bfSJunchao Zhang if (fs->cpermIndices) { 1540d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1541d460d7bfSJunchao Zhang thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1542d460d7bfSJunchao Zhang } 1543d460d7bfSJunchao Zhang 1544d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1545d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1546d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1547d460d7bfSJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n)); 1548d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1549d460d7bfSJunchao Zhang } 1550d460d7bfSJunchao Zhang #else 1551a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1552d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1553d71ae5a4SJacob Faibussowitsch { 1554c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1555465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1556465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1557465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1558465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1559bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1560aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1561aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1562aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1563bda325fcSPaul Mullowney 1564bda325fcSPaul Mullowney PetscFunctionBegin; 1565aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1566aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 15679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1568aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1569aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1570bda325fcSPaul Mullowney } 1571bda325fcSPaul Mullowney 1572bda325fcSPaul Mullowney /* Get the GPU pointers */ 15739566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 15749566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1575c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1576c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1577bda325fcSPaul Mullowney 15789566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1579aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 15809371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1581aa372e3fSPaul Mullowney 1582aa372e3fSPaul Mullowney /* First, solve U */ 15839f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 15849f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1585aa372e3fSPaul Mullowney 1586aa372e3fSPaul Mullowney /* Then, solve L */ 15879f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 15889f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1589aa372e3fSPaul Mullowney 1590aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 15919371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()); 1592aa372e3fSPaul Mullowney 1593aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. 
*/ 1594a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU); 1595bda325fcSPaul Mullowney 1596bda325fcSPaul Mullowney /* restore */ 15979566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 15989566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 15999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 16013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1602bda325fcSPaul Mullowney } 1603bda325fcSPaul Mullowney 1604d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1605d71ae5a4SJacob Faibussowitsch { 1606465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1607465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1608bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1609aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1610aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1611aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1612bda325fcSPaul Mullowney 1613bda325fcSPaul Mullowney PetscFunctionBegin; 1614aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1615aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 16169566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1617aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1618aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1619bda325fcSPaul Mullowney } 1620bda325fcSPaul Mullowney 1621bda325fcSPaul Mullowney /* Get the GPU pointers */ 16229566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 16239566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1624bda325fcSPaul Mullowney 16259566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1626aa372e3fSPaul Mullowney /* First, solve U */ 16279f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 16289f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1629aa372e3fSPaul Mullowney 1630aa372e3fSPaul Mullowney /* Then, solve L */ 16319f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 16329f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, 
loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1633bda325fcSPaul Mullowney 1634bda325fcSPaul Mullowney /* restore */ 16359566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 16369566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 16379566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16389566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 16393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1640bda325fcSPaul Mullowney } 1641bda325fcSPaul Mullowney 1642d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1643d71ae5a4SJacob Faibussowitsch { 1644465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1645465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1646465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1647465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 16489ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1649aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1650aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1651aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 16529ae82921SPaul Mullowney 16539ae82921SPaul Mullowney PetscFunctionBegin; 1654e057df02SPaul Mullowney /* Get the GPU pointers */ 16559566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 16569566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1657c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1658c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 16599ae82921SPaul Mullowney 16609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1661aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 16629371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1663aa372e3fSPaul Mullowney 1664aa372e3fSPaul Mullowney /* Next, solve L */ 16659f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 16669f7ba44dSJacob Faibussowitsch loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1667aa372e3fSPaul Mullowney 1668aa372e3fSPaul Mullowney /* Then, solve U */ 16699f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 16709f7ba44dSJacob Faibussowitsch upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), 
upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1671d49cd2b7SBarry Smith 16724e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 16739371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 16749ae82921SPaul Mullowney 16759566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 16769566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 16779566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16789566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 16793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16809ae82921SPaul Mullowney } 16819ae82921SPaul Mullowney 1682d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1683d71ae5a4SJacob Faibussowitsch { 1684465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1685465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 16869ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1687aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1688aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1689aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 16909ae82921SPaul Mullowney 16919ae82921SPaul Mullowney PetscFunctionBegin; 1692e057df02SPaul Mullowney /* Get the GPU pointers */ 16939566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 16949566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 16959ae82921SPaul Mullowney 16969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1697aa372e3fSPaul Mullowney /* First, solve L */ 16989f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 16999f7ba44dSJacob Faibussowitsch loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1700d49cd2b7SBarry Smith 1701aa372e3fSPaul Mullowney /* Next, solve U */ 17029f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 17039f7ba44dSJacob Faibussowitsch upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 17049ae82921SPaul Mullowney 17059566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 17069566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 17079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17089566063dSJacob 
Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 17093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 17109ae82921SPaul Mullowney } 1711d460d7bfSJunchao Zhang #endif 17129ae82921SPaul Mullowney 1713b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 17148eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *) 1715d71ae5a4SJacob Faibussowitsch { 1716da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1717da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1718da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1719da112707SJunchao Zhang CsrMatrix *Acsr; 1720da112707SJunchao Zhang PetscInt m, nz; 1721da112707SJunchao Zhang PetscBool flg; 1722da112707SJunchao Zhang 1723da112707SJunchao Zhang PetscFunctionBegin; 1724da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1725da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1726da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1727da112707SJunchao Zhang } 1728da112707SJunchao Zhang 1729da112707SJunchao Zhang /* Copy A's value to fact */ 1730da112707SJunchao Zhang m = fact->rmap->n; 1731da112707SJunchao Zhang nz = aij->nz; 1732da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1733da112707SJunchao Zhang Acsr = (CsrMatrix *)Acusp->mat->mat; 1734da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1735da112707SJunchao Zhang 1736bdb0d812SBarry Smith PetscCall(PetscLogGpuTimeBegin()); 1737da112707SJunchao Zhang /* Factorize fact inplace */ 17389371c9d4SSatish Balay if (m) 17399371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1740d460d7bfSJunchao Zhang fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1741da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1742da112707SJunchao Zhang int numerical_zero; 1743da112707SJunchao Zhang cusparseStatus_t status; 1744da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1745da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1746da112707SJunchao Zhang } 1747da112707SJunchao Zhang 1748*204a0e31SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 1749*204a0e31SJunchao Zhang if (fs->updatedSpSVAnalysis) { 1750*204a0e31SJunchao Zhang if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1751*204a0e31SJunchao Zhang if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 1752*204a0e31SJunchao Zhang } else 1753*204a0e31SJunchao Zhang #endif 1754*204a0e31SJunchao Zhang { 175512ba2bc6SJunchao Zhang /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 175612ba2bc6SJunchao Zhang See discussion at 
https://github.com/NVIDIA/CUDALibrarySamples/issues/78 175712ba2bc6SJunchao Zhang */ 17589371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1759da112707SJunchao Zhang 17609371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1761da112707SJunchao Zhang 1762*204a0e31SJunchao Zhang fs->updatedSpSVAnalysis = PETSC_TRUE; 176312ba2bc6SJunchao Zhang /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 176412ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 1765*204a0e31SJunchao Zhang } 176612ba2bc6SJunchao Zhang 1767da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 1768d460d7bfSJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. 1769d460d7bfSJunchao Zhang fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1770da112707SJunchao Zhang fact->ops->matsolve = NULL; 1771da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 1772bdb0d812SBarry Smith PetscCall(PetscLogGpuTimeEnd()); 1773da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 17743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1775da112707SJunchao Zhang } 1776da112707SJunchao Zhang 17778eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1778d71ae5a4SJacob Faibussowitsch { 1779da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1780da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1781da112707SJunchao Zhang PetscInt m, nz; 1782da112707SJunchao Zhang 1783da112707SJunchao Zhang PetscFunctionBegin; 1784da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1785da112707SJunchao Zhang PetscInt i; 1786da112707SJunchao Zhang PetscBool flg, missing; 1787da112707SJunchao Zhang 1788da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1789da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1790da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1791da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A, &missing, &i)); 1792da112707SJunchao Zhang PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1793da112707SJunchao Zhang } 1794da112707SJunchao Zhang 1795da112707SJunchao Zhang /* Free the old stale stuff */ 1796da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1797da112707SJunchao Zhang 1798da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1799da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 
1800da112707SJunchao Zhang */ 1801da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1802da112707SJunchao Zhang 1803da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 1804da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ILU; 1805da112707SJunchao Zhang fact->info.factor_mallocs = 0; 1806da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 1807da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 1808da112707SJunchao Zhang 1809da112707SJunchao Zhang aij->row = NULL; 1810da112707SJunchao Zhang aij->col = NULL; 1811da112707SJunchao Zhang 1812da112707SJunchao Zhang /* ====================================================================== */ 1813da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. */ 1814da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 1815da112707SJunchao Zhang /* ====================================================================== */ 1816da112707SJunchao Zhang const int *Ai, *Aj; 1817da112707SJunchao Zhang 1818da112707SJunchao Zhang m = fact->rmap->n; 1819da112707SJunchao Zhang nz = aij->nz; 1820da112707SJunchao Zhang 1821f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1822f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1823f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1824d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */ 1825d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1826d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1827da112707SJunchao Zhang 1828da112707SJunchao Zhang /* ====================================================================== */ 1829da112707SJunchao Zhang /* Create descriptors for M, L, U */ 1830da112707SJunchao Zhang /* ====================================================================== */ 1831da112707SJunchao Zhang cusparseFillMode_t fillMode; 1832da112707SJunchao Zhang cusparseDiagType_t diagType; 1833da112707SJunchao Zhang 1834da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1835da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1836da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1837da112707SJunchao Zhang 1838da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1839da112707SJunchao Zhang cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1840da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1841da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1842da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1843da112707SJunchao Zhang */ 1844da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 1845da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_UNIT; 1846d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 18479371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 18489371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1849da112707SJunchao Zhang 1850da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_UPPER; 1851da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1852d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 18539371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 18549371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1855da112707SJunchao Zhang 1856da112707SJunchao Zhang /* ========================================================================= */ 1857da112707SJunchao Zhang /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1858da112707SJunchao Zhang /* ========================================================================= */ 1859da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 18609371c9d4SSatish Balay if (m) 18619371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1862d460d7bfSJunchao Zhang fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1863da112707SJunchao Zhang 1864da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1865da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1866da112707SJunchao Zhang 1867da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1868da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1869da112707SJunchao Zhang 1870da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 18719371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1872da112707SJunchao Zhang 1873da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 18749371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1875da112707SJunchao Zhang 1876da112707SJunchao Zhang /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 187712ba2bc6SJunchao 
Zhang and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 187812ba2bc6SJunchao Zhang spsvBuffer_L/U cannot be shared (i.e., be the same buffer) in our case, but factBuffer_M can share with either of spsvBuffer_L/U. 187912ba2bc6SJunchao Zhang To save memory, we let factBuffer_M share storage with the larger of spsvBuffer_L/U. 1880da112707SJunchao Zhang */ 188112ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 188212ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 188312ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 1884da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 188512ba2bc6SJunchao Zhang } else { 188612ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 188712ba2bc6SJunchao Zhang fs->spsvBuffer_U = fs->factBuffer_M; 1888da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 188912ba2bc6SJunchao Zhang } 1890da112707SJunchao Zhang 1891da112707SJunchao Zhang /* ========================================================================== */ 1892da112707SJunchao Zhang /* Perform analysis of ilu0 on M, SpSV on L and U */ 1893da112707SJunchao Zhang /* The lower (upper) triangular part of M has the same sparsity pattern as L (U) */ 1894da112707SJunchao Zhang /* ========================================================================== */ 1895da112707SJunchao Zhang int structural_zero; 1896da112707SJunchao Zhang cusparseStatus_t status; 1897da112707SJunchao Zhang 1898da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 18999371c9d4SSatish Balay if (m) 19009371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1901d460d7bfSJunchao Zhang fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1902da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1903da112707SJunchao Zhang /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done.
*/ 1904da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1905da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1906da112707SJunchao Zhang } 1907da112707SJunchao Zhang 1908da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 19090dd8c0acSJunchao Zhang { 1910da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 19110dd8c0acSJunchao Zhang PetscInt *Ai, *Adiag, nzRow, nzLeft; 1912da112707SJunchao Zhang PetscLogDouble flops = 0.0; 1913da112707SJunchao Zhang 1914da112707SJunchao Zhang PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1915da112707SJunchao Zhang Ai = Aseq->i; 1916da112707SJunchao Zhang Adiag = Aseq->diag; 1917da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 1918da112707SJunchao Zhang if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1919da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 1920da112707SJunchao Zhang nzLeft = Adiag[i] - Ai[i]; 1921da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1922da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 1923da112707SJunchao Zhang */ 1924da112707SJunchao Zhang nzLeft = (nzRow - 1) / 2; 1925da112707SJunchao Zhang flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1926da112707SJunchao Zhang } 1927da112707SJunchao Zhang } 1928da112707SJunchao Zhang fs->numericFactFlops = flops; 19290dd8c0acSJunchao Zhang } 1930da112707SJunchao Zhang fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 19313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1932da112707SJunchao Zhang } 1933da112707SJunchao Zhang 1934d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1935d71ae5a4SJacob Faibussowitsch { 1936da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1937da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1938da112707SJunchao Zhang const PetscScalar *barray; 1939da112707SJunchao Zhang PetscScalar *xarray; 1940da112707SJunchao Zhang 1941da112707SJunchao Zhang PetscFunctionBegin; 1942da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1943da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1944da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1945da112707SJunchao Zhang 1946da112707SJunchao Zhang /* Solve L*y = b */ 1947da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1948da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 19499371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 19509371c9d4SSatish Balay fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1951da112707SJunchao Zhang 1952da112707SJunchao Zhang /* Solve Lt*x = y */ 1953da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 19549371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 19559371c9d4SSatish 
Balay fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1956da112707SJunchao Zhang 1957da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1958da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1959da112707SJunchao Zhang 1960da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1961da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 19623ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1963da112707SJunchao Zhang } 1964da112707SJunchao Zhang 19658eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1966d71ae5a4SJacob Faibussowitsch { 1967da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1968da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1969da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1970da112707SJunchao Zhang CsrMatrix *Acsr; 1971da112707SJunchao Zhang PetscInt m, nz; 1972da112707SJunchao Zhang PetscBool flg; 1973da112707SJunchao Zhang 1974da112707SJunchao Zhang PetscFunctionBegin; 1975da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1976da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1977da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1978da112707SJunchao Zhang } 1979da112707SJunchao Zhang 1980da112707SJunchao Zhang /* Copy A's value to fact */ 1981da112707SJunchao Zhang m = fact->rmap->n; 1982da112707SJunchao Zhang nz = aij->nz; 1983da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1984da112707SJunchao Zhang Acsr = (CsrMatrix *)Acusp->mat->mat; 1985da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1986da112707SJunchao Zhang 1987da112707SJunchao Zhang /* Factorize fact inplace */ 1988da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1989da112707SJunchao Zhang Function csric02() only takes the lower triangular part of matrix A to perform factorization. 1990da112707SJunchao Zhang The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1991da112707SJunchao Zhang and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1992da112707SJunchao Zhang In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 
1993da112707SJunchao Zhang */ 1994d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1995da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1996da112707SJunchao Zhang int numerical_zero; 1997da112707SJunchao Zhang cusparseStatus_t status; 1998da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1999da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 2000da112707SJunchao Zhang } 2001da112707SJunchao Zhang 2002*204a0e31SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 2003*204a0e31SJunchao Zhang if (fs->updatedSpSVAnalysis) { 2004*204a0e31SJunchao Zhang if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 2005*204a0e31SJunchao Zhang if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 2006*204a0e31SJunchao Zhang } else 2007*204a0e31SJunchao Zhang #endif 2008*204a0e31SJunchao Zhang { 20099371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 2010da112707SJunchao Zhang 2011da112707SJunchao Zhang /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 2012da112707SJunchao Zhang ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 2013da112707SJunchao Zhang */ 20149371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 2015*204a0e31SJunchao Zhang fs->updatedSpSVAnalysis = PETSC_TRUE; 2016*204a0e31SJunchao Zhang } 2017da112707SJunchao Zhang 2018da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 2019da112707SJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 2020da112707SJunchao Zhang fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 2021da112707SJunchao Zhang fact->ops->matsolve = NULL; 2022da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 2023da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 20243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2025da112707SJunchao Zhang } 2026da112707SJunchao Zhang 20278eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 2028d71ae5a4SJacob Faibussowitsch { 2029da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 2030da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 2031da112707SJunchao Zhang PetscInt m, nz; 2032da112707SJunchao Zhang 2033da112707SJunchao Zhang PetscFunctionBegin; 2034da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2035da112707SJunchao Zhang PetscInt i; 2036da112707SJunchao Zhang PetscBool flg, missing; 2037da112707SJunchao Zhang 2038da112707SJunchao Zhang 
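/* A brief recap of the solve path configured just above (sketch only): with the incomplete Cholesky factor L produced by csric02, the preconditioner is applied as M = L * L^T, so MatSolve_SeqAIJCUSPARSE_ICC0() performs
     solve L   y = b
     solve L^T x = y
   and because (L * L^T)^T = L * L^T, the same routine is registered for both fact->ops->solve and fact->ops->solvetranspose. The second solve uses CUSPARSE_OPERATION_TRANSPOSE, consistent with the cuSPARSE note quoted above (the conjugate-transpose operation is reported as unsupported there). */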
PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2039da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2040da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2041da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A, &missing, &i)); 2042da112707SJunchao Zhang PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2043da112707SJunchao Zhang } 2044da112707SJunchao Zhang 2045da112707SJunchao Zhang /* Free the old stale stuff */ 2046da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2047da112707SJunchao Zhang 2048da112707SJunchao Zhang /* Copy over A's metadata to fact. Note that we also allocate fact's i, j, a on the host, 2049da112707SJunchao Zhang but they will not be used; we allocate them only to make debugging easier. 2050da112707SJunchao Zhang */ 2051da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2052da112707SJunchao Zhang 2053da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 2054da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ICC; 2055da112707SJunchao Zhang fact->info.factor_mallocs = 0; 2056da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 2057da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 2058da112707SJunchao Zhang 2059da112707SJunchao Zhang aij->row = NULL; 2060da112707SJunchao Zhang aij->col = NULL; 2061da112707SJunchao Zhang 2062da112707SJunchao Zhang /* ====================================================================== */ 2063da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact.
*/ 2064da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 2065da112707SJunchao Zhang /* ====================================================================== */ 2066da112707SJunchao Zhang const int *Ai, *Aj; 2067da112707SJunchao Zhang 2068da112707SJunchao Zhang m = fact->rmap->n; 2069da112707SJunchao Zhang nz = aij->nz; 2070da112707SJunchao Zhang 2071f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 2072f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 2073da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2074da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2075d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2076d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2077da112707SJunchao Zhang 2078da112707SJunchao Zhang /* ====================================================================== */ 2079da112707SJunchao Zhang /* Create mat descriptors for M, L */ 2080da112707SJunchao Zhang /* ====================================================================== */ 2081da112707SJunchao Zhang cusparseFillMode_t fillMode; 2082da112707SJunchao Zhang cusparseDiagType_t diagType; 2083da112707SJunchao Zhang 2084da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2085da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2086da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2087da112707SJunchao Zhang 2088da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 2089da112707SJunchao Zhang cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2090da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2091da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2092da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
2093da112707SJunchao Zhang */ 2094da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 2095da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2096d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 20979371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 20989371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 2099da112707SJunchao Zhang 2100da112707SJunchao Zhang /* ========================================================================= */ 2101da112707SJunchao Zhang /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2102da112707SJunchao Zhang /* ========================================================================= */ 2103da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2104d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); 2105da112707SJunchao Zhang 2106da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 2107da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 2108da112707SJunchao Zhang 2109da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 2110da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 2111da112707SJunchao Zhang 2112da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 21139371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 2114da112707SJunchao Zhang 2115da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 21169371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 2117da112707SJunchao Zhang 211812ba2bc6SJunchao Zhang /* To save device memory, we make the factorization buffer share with one of the solver buffer. 211912ba2bc6SJunchao Zhang See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 
212012ba2bc6SJunchao Zhang */ 212112ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 212212ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 212312ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 2124da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 212512ba2bc6SJunchao Zhang } else { 212612ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 212712ba2bc6SJunchao Zhang fs->spsvBuffer_Lt = fs->factBuffer_M; 212812ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 212912ba2bc6SJunchao Zhang } 2130da112707SJunchao Zhang 2131da112707SJunchao Zhang /* ========================================================================== */ 2132da112707SJunchao Zhang /* Perform analysis of ic0 on M */ 2133da112707SJunchao Zhang /* The lower triangular part of M has the same sparsity pattern as L */ 2134da112707SJunchao Zhang /* ========================================================================== */ 2135da112707SJunchao Zhang int structural_zero; 2136da112707SJunchao Zhang cusparseStatus_t status; 2137da112707SJunchao Zhang 2138da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2139d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 2140da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2141da112707SJunchao Zhang /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 2142da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2143da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 2144da112707SJunchao Zhang } 2145da112707SJunchao Zhang 2146da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 21470dd8c0acSJunchao Zhang { 2148da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 21490dd8c0acSJunchao Zhang PetscInt *Ai, nzRow, nzLeft; 2150da112707SJunchao Zhang PetscLogDouble flops = 0.0; 2151da112707SJunchao Zhang 2152da112707SJunchao Zhang Ai = Aseq->i; 2153da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2154da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 2155da112707SJunchao Zhang if (nzRow > 1) { 2156da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2157da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 
2158da112707SJunchao Zhang */ 2159da112707SJunchao Zhang nzLeft = (nzRow - 1) / 2; 2160da112707SJunchao Zhang flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 2161da112707SJunchao Zhang } 2162da112707SJunchao Zhang } 2163da112707SJunchao Zhang fs->numericFactFlops = flops; 21640dd8c0acSJunchao Zhang } 2165da112707SJunchao Zhang fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 21663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2167da112707SJunchao Zhang } 2168da112707SJunchao Zhang #endif 2169da112707SJunchao Zhang 2170d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 2171d460d7bfSJunchao Zhang { 2172b820271fSJunchao Zhang // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors. 2173b820271fSJunchao Zhang Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2174d460d7bfSJunchao Zhang 2175d460d7bfSJunchao Zhang PetscFunctionBegin; 2176d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2177d460d7bfSJunchao Zhang PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 2178d460d7bfSJunchao Zhang B->offloadmask = PETSC_OFFLOAD_CPU; 2179d460d7bfSJunchao Zhang 2180d460d7bfSJunchao Zhang if (!cusparsestruct->use_cpu_solve) { 2181b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2182d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; 2183d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 2184d460d7bfSJunchao Zhang #else 2185d460d7bfSJunchao Zhang /* determine which version of MatSolve needs to be used. */ 2186d460d7bfSJunchao Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2187d460d7bfSJunchao Zhang IS isrow = b->row, iscol = b->col; 2188d460d7bfSJunchao Zhang PetscBool row_identity, col_identity; 2189d460d7bfSJunchao Zhang 2190d460d7bfSJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 2191d460d7bfSJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 2192d460d7bfSJunchao Zhang if (row_identity && col_identity) { 2193d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2194d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2195d460d7bfSJunchao Zhang } else { 2196d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2197d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2198d460d7bfSJunchao Zhang } 2199d460d7bfSJunchao Zhang #endif 2200d460d7bfSJunchao Zhang } 2201d460d7bfSJunchao Zhang B->ops->matsolve = NULL; 2202d460d7bfSJunchao Zhang B->ops->matsolvetranspose = NULL; 2203d460d7bfSJunchao Zhang 2204d460d7bfSJunchao Zhang /* get the triangular factors */ 2205d460d7bfSJunchao Zhang if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2206d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2207d460d7bfSJunchao Zhang } 2208d460d7bfSJunchao Zhang 2209d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2210d460d7bfSJunchao Zhang { 2211d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2212d460d7bfSJunchao Zhang 2213d460d7bfSJunchao Zhang PetscFunctionBegin; 2214d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2215d460d7bfSJunchao Zhang 
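/* A condensed view of the general LU path being wired up here (sketch only, mirroring the code above):
     MatLUFactorSymbolic_SeqAIJCUSPARSE : reset the old triangular factors, then perform the symbolic
                                          factorization on the host with MatLUFactorSymbolic_SeqAIJ()
     MatLUFactorNumeric_SeqAIJCUSPARSE  : MatSeqAIJCUSPARSECopyFromGPU(A);       // pull current values to the host
                                          MatLUFactorNumeric_SeqAIJ(B, A, info); // numeric factorization on the CPU
                                          // then, unless use_cpu_solve is set, point B->ops->solve / solvetranspose at the
                                          // cuSPARSE versions and copy the factors to the GPU with
                                          // MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)
*/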
PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2216d460d7bfSJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2217d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2218d460d7bfSJunchao Zhang } 2219d460d7bfSJunchao Zhang 2220d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2221d71ae5a4SJacob Faibussowitsch { 2222da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2223da112707SJunchao Zhang 2224da112707SJunchao Zhang PetscFunctionBegin; 2225b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2226bc996fdcSJunchao Zhang PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2227f82ac72cSJunchao Zhang if (!info->factoronhost) { 2228da112707SJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 2229da112707SJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 2230bc996fdcSJunchao Zhang } 2231da112707SJunchao Zhang if (!info->levels && row_identity && col_identity) { 2232da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 2233da112707SJunchao Zhang } else 2234da112707SJunchao Zhang #endif 2235da112707SJunchao Zhang { 2236da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2237da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2238da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2239da112707SJunchao Zhang } 22403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2241da112707SJunchao Zhang } 2242da112707SJunchao Zhang 2243d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2244d71ae5a4SJacob Faibussowitsch { 2245da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2246da112707SJunchao Zhang 2247da112707SJunchao Zhang PetscFunctionBegin; 2248b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2249bc996fdcSJunchao Zhang PetscBool perm_identity = PETSC_FALSE; 2250f82ac72cSJunchao Zhang if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); 2251da112707SJunchao Zhang if (!info->levels && perm_identity) { 2252da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2253da112707SJunchao Zhang } else 2254da112707SJunchao Zhang #endif 2255da112707SJunchao Zhang { 2256da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2257da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2258da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2259da112707SJunchao Zhang } 22603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2261da112707SJunchao Zhang } 2262da112707SJunchao Zhang 2263d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2264d71ae5a4SJacob Faibussowitsch { 2265da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2266da112707SJunchao Zhang 2267da112707SJunchao Zhang PetscFunctionBegin; 2268da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2269da112707SJunchao Zhang 
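  /* Unlike the ILU(0)/ICC(0) fast paths above (CUDA >= 11.4, zero fill levels, natural ordering, factorization not
     forced onto the host), this generic Cholesky path always performs the symbolic factorization with the host
     MATSEQAIJ kernels. A sketch of options that would select the device ICC(0) path instead (assuming a PC of type PCICC):
       -pc_factor_levels 0 -pc_factor_mat_ordering_type natural */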
PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 2270da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 22713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2272da112707SJunchao Zhang } 2273da112707SJunchao Zhang 227466976f2fSJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type) 2275d71ae5a4SJacob Faibussowitsch { 2276841d4cb1SJunchao Zhang PetscFunctionBegin; 2277841d4cb1SJunchao Zhang *type = MATSOLVERCUSPARSE; 22783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2279841d4cb1SJunchao Zhang } 2280841d4cb1SJunchao Zhang 2281841d4cb1SJunchao Zhang /*MC 2282841d4cb1SJunchao Zhang MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential matrices 228311a5261eSBarry Smith of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported 2284841d4cb1SJunchao Zhang algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer 2285841d4cb1SJunchao Zhang performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the 228611a5261eSBarry Smith CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2287841d4cb1SJunchao Zhang algorithms are not recommended. This class does NOT support direct solver operations. 2288841d4cb1SJunchao Zhang 2289841d4cb1SJunchao Zhang Level: beginner 2290841d4cb1SJunchao Zhang 22911cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, 22922ef1f0ffSBarry Smith `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2293841d4cb1SJunchao Zhang M*/ 2294841d4cb1SJunchao Zhang 2295d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) 2296d71ae5a4SJacob Faibussowitsch { 2297841d4cb1SJunchao Zhang PetscInt n = A->rmap->n; 2298841d4cb1SJunchao Zhang 2299841d4cb1SJunchao Zhang PetscFunctionBegin; 2300841d4cb1SJunchao Zhang PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2301841d4cb1SJunchao Zhang PetscCall(MatSetSizes(*B, n, n, n, n)); 2302b820271fSJunchao Zhang (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors 2303841d4cb1SJunchao Zhang PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); 2304841d4cb1SJunchao Zhang 2305841d4cb1SJunchao Zhang if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2306841d4cb1SJunchao Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2307841d4cb1SJunchao Zhang PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2308841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2309841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2310841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2311841d4cb1SJunchao Zhang } else { 2312841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2313841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2314841d4cb1SJunchao Zhang } 2315841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2316841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char
**)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2317841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2318841d4cb1SJunchao Zhang } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2319841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2320841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2321841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2322841d4cb1SJunchao Zhang } else { 2323841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2324841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2325841d4cb1SJunchao Zhang } 2326841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2327841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2328841d4cb1SJunchao Zhang } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types"); 2329841d4cb1SJunchao Zhang 2330841d4cb1SJunchao Zhang PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2331841d4cb1SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 2332f4f49eeaSPierre Jolivet PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse)); 23333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2334841d4cb1SJunchao Zhang } 2335841d4cb1SJunchao Zhang 2336d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 2337d71ae5a4SJacob Faibussowitsch { 23387e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 23397e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2340b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2341da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 23420dd8c0acSJunchao Zhang #endif 23437e8381f9SStefano Zampini 23447e8381f9SStefano Zampini PetscFunctionBegin; 23457e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 23469566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 2347da112707SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 2348da112707SJunchao Zhang CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat; 23499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2350da112707SJunchao Zhang } 2351b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2352da112707SJunchao Zhang else if (fs->csrVal) { 2353da112707SJunchao Zhang /* We have a factorized matrix on device and are able to copy it to host */ 2354da112707SJunchao Zhang PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2355da112707SJunchao Zhang } 2356da112707SJunchao Zhang #endif 23579371c9d4SSatish Balay else 23589371c9d4SSatish Balay SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host"); 23599566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar))); 23609566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 23617e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 23627e8381f9SStefano Zampini } 
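  /* At this point the host and device copies agree (PETSC_OFFLOAD_BOTH) or the host copy was already current.
     The Get/Restore array callbacks below rely on this: a read access triggers the copy above, while a write
     access downgrades the mask to PETSC_OFFLOAD_CPU so the next GPU use re-uploads the values.
     A minimal host-side access sketch (assuming A is a MATSEQAIJCUSPARSE matrix):
       PetscScalar *val;
       PetscCall(MatSeqAIJGetArray(A, &val));        // copies values device -> host if needed
       val[0] *= 2.0;                                // modify on the host
       PetscCall(MatSeqAIJRestoreArray(A, &val));    // marks the host copy as the up-to-date one */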
23633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 23647e8381f9SStefano Zampini } 23657e8381f9SStefano Zampini 2366d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2367d71ae5a4SJacob Faibussowitsch { 23687e8381f9SStefano Zampini PetscFunctionBegin; 23699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 237067a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 23713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 237267a45760SJunchao Zhang } 237367a45760SJunchao Zhang 2374d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2375d71ae5a4SJacob Faibussowitsch { 237667a45760SJunchao Zhang PetscFunctionBegin; 23777e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 237867a45760SJunchao Zhang *array = NULL; 23793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 238067a45760SJunchao Zhang } 238167a45760SJunchao Zhang 2382d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2383d71ae5a4SJacob Faibussowitsch { 238467a45760SJunchao Zhang PetscFunctionBegin; 23859566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 238667a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 23873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 238867a45760SJunchao Zhang } 238967a45760SJunchao Zhang 23908eb1d50fSPierre Jolivet static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[]) 2391d71ae5a4SJacob Faibussowitsch { 239267a45760SJunchao Zhang PetscFunctionBegin; 239367a45760SJunchao Zhang *array = NULL; 23943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 239567a45760SJunchao Zhang } 239667a45760SJunchao Zhang 2397d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2398d71ae5a4SJacob Faibussowitsch { 239967a45760SJunchao Zhang PetscFunctionBegin; 240067a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 24013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 240267a45760SJunchao Zhang } 240367a45760SJunchao Zhang 2404d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2405d71ae5a4SJacob Faibussowitsch { 240667a45760SJunchao Zhang PetscFunctionBegin; 240767a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 240867a45760SJunchao Zhang *array = NULL; 24093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 24107e8381f9SStefano Zampini } 24117e8381f9SStefano Zampini 2412d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 2413d71ae5a4SJacob Faibussowitsch { 24147ee59b9bSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp; 24157ee59b9bSJunchao Zhang CsrMatrix *matrix; 24167ee59b9bSJunchao Zhang 24177ee59b9bSJunchao Zhang PetscFunctionBegin; 24187ee59b9bSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 24197ee59b9bSJunchao Zhang PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 24207ee59b9bSJunchao Zhang cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 24217ee59b9bSJunchao Zhang PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is 
NULL"); 24227ee59b9bSJunchao Zhang matrix = (CsrMatrix *)cusp->mat->mat; 24237ee59b9bSJunchao Zhang 24247ee59b9bSJunchao Zhang if (i) { 24257ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 24267ee59b9bSJunchao Zhang *i = matrix->row_offsets->data().get(); 24277ee59b9bSJunchao Zhang #else 24287ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 24297ee59b9bSJunchao Zhang #endif 24307ee59b9bSJunchao Zhang } 24317ee59b9bSJunchao Zhang if (j) { 24327ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 24337ee59b9bSJunchao Zhang *j = matrix->column_indices->data().get(); 24347ee59b9bSJunchao Zhang #else 24357ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 24367ee59b9bSJunchao Zhang #endif 24377ee59b9bSJunchao Zhang } 24387ee59b9bSJunchao Zhang if (a) *a = matrix->values->data().get(); 24397ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 24403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 24417ee59b9bSJunchao Zhang } 24427ee59b9bSJunchao Zhang 2443d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 2444d71ae5a4SJacob Faibussowitsch { 2445aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 24467c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 24479ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2448213423ffSJunchao Zhang PetscInt m = A->rmap->n, *ii, *ridx, tmp; 2449aa372e3fSPaul Mullowney cusparseStatus_t stat; 2450abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 24519ae82921SPaul Mullowney 24529ae82921SPaul Mullowney PetscFunctionBegin; 245328b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU"); 2454c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2455a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2456a49f1ed0SStefano Zampini CsrMatrix *matrix; 2457afb2bd1cSJunchao Zhang matrix = (CsrMatrix *)cusparsestruct->mat->mat; 245885ba7357SStefano Zampini 245908401ef6SPierre Jolivet PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 24609566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2461afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a + a->nz); 24629566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 2463f4f49eeaSPierre Jolivet PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar))); 24649566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 24659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 246634d6c7a5SJose E. 
Roman } else { 2467abb89eb1SStefano Zampini PetscInt nnz; 24689566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 24699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 24709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 24717c700b8dSJunchao Zhang delete cusparsestruct->workVector; 247281902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 2473a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 2474a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 24759ae82921SPaul Mullowney try { 24769ae82921SPaul Mullowney if (a->compressedrow.use) { 24779ae82921SPaul Mullowney m = a->compressedrow.nrows; 24789ae82921SPaul Mullowney ii = a->compressedrow.i; 24799ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 24809ae82921SPaul Mullowney } else { 2481213423ffSJunchao Zhang m = A->rmap->n; 2482213423ffSJunchao Zhang ii = a->i; 2483e6e9a74fSStefano Zampini ridx = NULL; 24849ae82921SPaul Mullowney } 248508401ef6SPierre Jolivet PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 24869371c9d4SSatish Balay if (!a->a) { 24879371c9d4SSatish Balay nnz = ii[m]; 24889371c9d4SSatish Balay both = PETSC_FALSE; 24899371c9d4SSatish Balay } else nnz = a->nz; 249008401ef6SPierre Jolivet PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 24919ae82921SPaul Mullowney 249285ba7357SStefano Zampini /* create cusparse matrix */ 2493abb89eb1SStefano Zampini cusparsestruct->nrows = m; 2494aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 24959566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 24969566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 24979566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 24989ae82921SPaul Mullowney 2499f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar))); 2500f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar))); 2501f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar))); 25029566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 25039566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 25049566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 25059566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2506b06137fdSPaul Mullowney 2507aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2508aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2509aa372e3fSPaul Mullowney /* set the matrix */ 2510afb2bd1cSJunchao Zhang CsrMatrix *mat = new CsrMatrix; 2511afb2bd1cSJunchao Zhang mat->num_rows = m; 2512afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2513abb89eb1SStefano Zampini mat->num_entries = nnz; 2514ee477ddbSJunchao Zhang PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2515afb2bd1cSJunchao Zhang 
mat->row_offsets->assign(ii, ii + m + 1); 25169ae82921SPaul Mullowney 2517ee477ddbSJunchao Zhang PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2518abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j + nnz); 2519aa372e3fSPaul Mullowney 2520ee477ddbSJunchao Zhang PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2521abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a + nnz); 2522aa372e3fSPaul Mullowney 2523aa372e3fSPaul Mullowney /* assign the pointer */ 2524afb2bd1cSJunchao Zhang matstruct->mat = mat; 2525afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2526afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 25279371c9d4SSatish Balay stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 25289371c9d4SSatish Balay CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 25299371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2530afb2bd1cSJunchao Zhang } 2531afb2bd1cSJunchao Zhang #endif 2532aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2533afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2534afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2535afb2bd1cSJunchao Zhang #else 2536afb2bd1cSJunchao Zhang CsrMatrix *mat = new CsrMatrix; 2537afb2bd1cSJunchao Zhang mat->num_rows = m; 2538afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2539abb89eb1SStefano Zampini mat->num_entries = nnz; 2540ee477ddbSJunchao Zhang PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2541afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m + 1); 2542aa372e3fSPaul Mullowney 2543ee477ddbSJunchao Zhang PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2544abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j + nnz); 2545aa372e3fSPaul Mullowney 2546ee477ddbSJunchao Zhang PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2547abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a + nnz); 2548aa372e3fSPaul Mullowney 2549aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 25509566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 25519371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 25529371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 25539371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2554aa372e3fSPaul Mullowney /* assign the pointer */ 2555aa372e3fSPaul Mullowney matstruct->mat = hybMat; 2556aa372e3fSPaul Mullowney 2557afb2bd1cSJunchao Zhang if (mat) { 2558afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY *)mat->values; 2559afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2560afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2561afb2bd1cSJunchao Zhang delete (CsrMatrix *)mat; 2562087f3262SPaul Mullowney } 2563afb2bd1cSJunchao Zhang #endif 2564087f3262SPaul Mullowney } 2565ca45077fSPaul Mullowney 2566aa372e3fSPaul Mullowney /* assign the compressed row indices */ 2567213423ffSJunchao Zhang if (a->compressedrow.use) { 2568ee477ddbSJunchao Zhang PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m)); 2569ee477ddbSJunchao Zhang PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m)); 2570aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx, ridx + m); 2571213423ffSJunchao Zhang tmp = m; 2572213423ffSJunchao Zhang } else { 2573213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 2574213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 2575213423ffSJunchao Zhang tmp = 0; 2576213423ffSJunchao Zhang } 25779566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2578aa372e3fSPaul Mullowney 2579aa372e3fSPaul Mullowney /* assign the pointer */ 2580aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 2581d71ae5a4SJacob Faibussowitsch } catch (char *ex) { 2582d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 2583d71ae5a4SJacob Faibussowitsch } 25849566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 25859566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 258634d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 258734d6c7a5SJose E. 
Roman } 2588abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 25899ae82921SPaul Mullowney } 25903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 25919ae82921SPaul Mullowney } 25929ae82921SPaul Mullowney 25939371c9d4SSatish Balay struct VecCUDAPlusEquals { 2594aa372e3fSPaul Mullowney template <typename Tuple> 2595d71ae5a4SJacob Faibussowitsch __host__ __device__ void operator()(Tuple t) 2596d71ae5a4SJacob Faibussowitsch { 2597aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2598aa372e3fSPaul Mullowney } 2599aa372e3fSPaul Mullowney }; 2600aa372e3fSPaul Mullowney 26019371c9d4SSatish Balay struct VecCUDAEquals { 26027e8381f9SStefano Zampini template <typename Tuple> 2603d71ae5a4SJacob Faibussowitsch __host__ __device__ void operator()(Tuple t) 2604d71ae5a4SJacob Faibussowitsch { 26057e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 26067e8381f9SStefano Zampini } 26077e8381f9SStefano Zampini }; 26087e8381f9SStefano Zampini 26099371c9d4SSatish Balay struct VecCUDAEqualsReverse { 2610e6e9a74fSStefano Zampini template <typename Tuple> 2611d71ae5a4SJacob Faibussowitsch __host__ __device__ void operator()(Tuple t) 2612d71ae5a4SJacob Faibussowitsch { 2613e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 2614e6e9a74fSStefano Zampini } 2615e6e9a74fSStefano Zampini }; 2616e6e9a74fSStefano Zampini 2617afb2bd1cSJunchao Zhang struct MatMatCusparse { 2618ccdfe979SStefano Zampini PetscBool cisdense; 2619ccdfe979SStefano Zampini PetscScalar *Bt; 2620ccdfe979SStefano Zampini Mat X; 2621fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2622fcdce8c4SStefano Zampini PetscLogDouble flops; 2623fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 2624b4285af6SJunchao Zhang 2625afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2626fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2627afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2628afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 2629afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 2630afb2bd1cSJunchao Zhang PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2631b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2632b4285af6SJunchao Zhang void *dBuffer4; 2633b4285af6SJunchao Zhang void *dBuffer5; 2634b4285af6SJunchao Zhang #endif 2635fcdce8c4SStefano Zampini size_t mmBufferSize; 2636fcdce8c4SStefano Zampini void *mmBuffer; 2637fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2638fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2639afb2bd1cSJunchao Zhang #endif 2640afb2bd1cSJunchao Zhang }; 2641ccdfe979SStefano Zampini 2642d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2643d71ae5a4SJacob Faibussowitsch { 2644ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 2645ccdfe979SStefano Zampini 2646ccdfe979SStefano Zampini PetscFunctionBegin; 26479566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->Bt)); 2648fcdce8c4SStefano Zampini delete mmdata->Bcsr; 2649afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 26509566063dSJacob Faibussowitsch if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 26519566063dSJacob Faibussowitsch if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 26529566063dSJacob 
Faibussowitsch if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 26539566063dSJacob Faibussowitsch if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2654b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 26559566063dSJacob Faibussowitsch if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 26569566063dSJacob Faibussowitsch if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2657b4285af6SJunchao Zhang #endif 26589566063dSJacob Faibussowitsch if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 26599566063dSJacob Faibussowitsch if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2660afb2bd1cSJunchao Zhang #endif 26619566063dSJacob Faibussowitsch PetscCall(MatDestroy(&mmdata->X)); 26629566063dSJacob Faibussowitsch PetscCall(PetscFree(data)); 26633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2664ccdfe979SStefano Zampini } 2665ccdfe979SStefano Zampini 26664742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal() 2667ccdfe979SStefano Zampini 2668d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2669d71ae5a4SJacob Faibussowitsch { 2670ccdfe979SStefano Zampini Mat_Product *product = C->product; 2671ccdfe979SStefano Zampini Mat A, B; 2672afb2bd1cSJunchao Zhang PetscInt m, n, blda, clda; 2673ccdfe979SStefano Zampini PetscBool flg, biscuda; 2674ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2675ccdfe979SStefano Zampini cusparseStatus_t stat; 2676ccdfe979SStefano Zampini cusparseOperation_t opA; 2677ccdfe979SStefano Zampini const PetscScalar *barray; 2678ccdfe979SStefano Zampini PetscScalar *carray; 2679ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2680ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2681ccdfe979SStefano Zampini CsrMatrix *csrmat; 2682ccdfe979SStefano Zampini 2683ccdfe979SStefano Zampini PetscFunctionBegin; 2684ccdfe979SStefano Zampini MatCheckProduct(C, 1); 268528b400f6SJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2686ccdfe979SStefano Zampini mmdata = (MatMatCusparse *)product->data; 2687ccdfe979SStefano Zampini A = product->A; 2688ccdfe979SStefano Zampini B = product->B; 26899566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 269028b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2691ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2692ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 269328b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 26949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2695ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2696ccdfe979SStefano Zampini switch (product->type) { 2697ccdfe979SStefano Zampini case MATPRODUCT_AB: 2698ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2699ccdfe979SStefano Zampini mat = cusp->mat; 2700ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2701ccdfe979SStefano Zampini m = A->rmap->n; 2702ccdfe979SStefano 
Zampini n = B->cmap->n; 2703ccdfe979SStefano Zampini break; 2704ccdfe979SStefano Zampini case MATPRODUCT_AtB: 27051a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2706e6e9a74fSStefano Zampini mat = cusp->mat; 2707e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2708e6e9a74fSStefano Zampini } else { 27099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2710ccdfe979SStefano Zampini mat = cusp->matTranspose; 2711ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2712e6e9a74fSStefano Zampini } 2713ccdfe979SStefano Zampini m = A->cmap->n; 2714ccdfe979SStefano Zampini n = B->cmap->n; 2715ccdfe979SStefano Zampini break; 2716ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2717ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2718ccdfe979SStefano Zampini mat = cusp->mat; 2719ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2720ccdfe979SStefano Zampini m = A->rmap->n; 2721ccdfe979SStefano Zampini n = B->rmap->n; 2722ccdfe979SStefano Zampini break; 2723d71ae5a4SJacob Faibussowitsch default: 2724d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2725ccdfe979SStefano Zampini } 272628b400f6SJacob Faibussowitsch PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2727ccdfe979SStefano Zampini csrmat = (CsrMatrix *)mat->mat; 2728ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 27299566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 27309566063dSJacob Faibussowitsch if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2731cd3f9d89SJunchao Zhang PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr)); 2732afb2bd1cSJunchao Zhang 27339566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(B, &blda)); 2734c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2735cd3f9d89SJunchao Zhang PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); 27369566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2737c8378d12SStefano Zampini } else { 2738cd3f9d89SJunchao Zhang PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr)); 27399566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C, &clda)); 2740c8378d12SStefano Zampini } 2741c8378d12SStefano Zampini 27429566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2743afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2744afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2745fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 2746fe5544b9SJunchao Zhang cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA]; 2747fe5544b9SJunchao Zhang #else 2748fe5544b9SJunchao Zhang cusparseSpMatDescr_t &matADescr = mat->matDescr; 2749fe5544b9SJunchao Zhang #endif 2750fe5544b9SJunchao Zhang 2751a5b23f4aSJose E. 
Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2752afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2753fcdce8c4SStefano Zampini size_t mmBufferSize; 27549371c9d4SSatish Balay if (mmdata->initialized && mmdata->Blda != blda) { 27559371c9d4SSatish Balay PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 27569371c9d4SSatish Balay mmdata->matBDescr = NULL; 27579371c9d4SSatish Balay } 2758afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 27599566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2760afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2761afb2bd1cSJunchao Zhang } 2762c8378d12SStefano Zampini 27639371c9d4SSatish Balay if (mmdata->initialized && mmdata->Clda != clda) { 27649371c9d4SSatish Balay PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 27659371c9d4SSatish Balay mmdata->matCDescr = NULL; 27669371c9d4SSatish Balay } 2767afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 27689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2769afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2770afb2bd1cSJunchao Zhang } 2771afb2bd1cSJunchao Zhang 2772fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0 2773fe5544b9SJunchao Zhang if (matADescr) { 277417f5f06fSJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug 2775fe5544b9SJunchao Zhang matADescr = NULL; 2776fe5544b9SJunchao Zhang } 2777fe5544b9SJunchao Zhang #endif 2778fe5544b9SJunchao Zhang 2779fe5544b9SJunchao Zhang if (!matADescr) { 2780fe5544b9SJunchao Zhang stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 27819371c9d4SSatish Balay CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 27829371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2783afb2bd1cSJunchao Zhang } 2784fe5544b9SJunchao Zhang 2785fe5544b9SJunchao Zhang PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize)); 2786fe5544b9SJunchao Zhang 2787fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 27889566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 27899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2790fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2791fcdce8c4SStefano Zampini } 2792fe5544b9SJunchao Zhang 2793fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0 2794fe5544b9SJunchao Zhang PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2795fe5544b9SJunchao Zhang #endif 2796fe5544b9SJunchao Zhang 2797afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 
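      /* The descriptors and the SpMM work buffer created above are cached in mmdata and reused by later numeric calls;
         they are rebuilt only if the leading dimension of B or C changes (and, for CUDA >= 12.4, the A descriptor is
         recreated each time as a workaround). For instance, a repeated reuse call such as
           PetscCall(MatMatMult(A, B, MAT_REUSE_MATRIX, PETSC_DEFAULT, &C));   // sketch: reuses mmdata->mmBuffer
         takes the cheaper else-branch below, which only refreshes the data pointers. */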
2798afb2bd1cSJunchao Zhang } else { 2799afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2800fe5544b9SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get())); 28019566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 28029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2803afb2bd1cSJunchao Zhang } 2804afb2bd1cSJunchao Zhang 2805afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2806fe5544b9SJunchao Zhang PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2807afb2bd1cSJunchao Zhang #else 2808afb2bd1cSJunchao Zhang PetscInt k; 2809afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2810ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2811ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2812ccdfe979SStefano Zampini cublasStatus_t cerr; 2813ccdfe979SStefano Zampini 28149566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 28159371c9d4SSatish Balay cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 28169371c9d4SSatish Balay PetscCallCUBLAS(cerr); 2817ccdfe979SStefano Zampini blda = B->cmap->n; 2818afb2bd1cSJunchao Zhang k = B->cmap->n; 2819afb2bd1cSJunchao Zhang } else { 2820afb2bd1cSJunchao Zhang k = B->rmap->n; 2821ccdfe979SStefano Zampini } 2822ccdfe979SStefano Zampini 2823afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 28249371c9d4SSatish Balay stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? 
mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 28259371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2826afb2bd1cSJunchao Zhang #endif 28279566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 28289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 2829cd3f9d89SJunchao Zhang PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray)); 2830ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2831cd3f9d89SJunchao Zhang PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 28324742e46bSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2833ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2834cd3f9d89SJunchao Zhang PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 28354742e46bSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2836ccdfe979SStefano Zampini } else { 2837cd3f9d89SJunchao Zhang PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray)); 2838ccdfe979SStefano Zampini } 283948a46eb9SPierre Jolivet if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 284048a46eb9SPierre Jolivet if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 28413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2842ccdfe979SStefano Zampini } 2843ccdfe979SStefano Zampini 2844d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2845d71ae5a4SJacob Faibussowitsch { 2846ccdfe979SStefano Zampini Mat_Product *product = C->product; 2847ccdfe979SStefano Zampini Mat A, B; 2848ccdfe979SStefano Zampini PetscInt m, n; 2849ccdfe979SStefano Zampini PetscBool cisdense, flg; 2850ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2851ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2852ccdfe979SStefano Zampini 2853ccdfe979SStefano Zampini PetscFunctionBegin; 2854ccdfe979SStefano Zampini MatCheckProduct(C, 1); 285528b400f6SJacob Faibussowitsch PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2856ccdfe979SStefano Zampini A = product->A; 2857ccdfe979SStefano Zampini B = product->B; 28589566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 285928b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2860ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 286108401ef6SPierre Jolivet PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2862ccdfe979SStefano Zampini switch (product->type) { 2863ccdfe979SStefano Zampini case MATPRODUCT_AB: 2864ccdfe979SStefano Zampini m = A->rmap->n; 2865ccdfe979SStefano Zampini n = B->cmap->n; 28660e6a1e94SMark Adams PetscCall(MatSetBlockSizesFromMats(C, A, B)); 2867ccdfe979SStefano Zampini break; 2868ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2869ccdfe979SStefano Zampini m = A->cmap->n; 2870ccdfe979SStefano Zampini n = B->cmap->n; 28710e6a1e94SMark Adams if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 28720e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 2873ccdfe979SStefano Zampini break; 2874ccdfe979SStefano Zampini case MATPRODUCT_ABt: 
2875ccdfe979SStefano Zampini m = A->rmap->n; 2876ccdfe979SStefano Zampini n = B->rmap->n; 28770e6a1e94SMark Adams if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 28780e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 2879ccdfe979SStefano Zampini break; 2880ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2881ccdfe979SStefano Zampini m = B->cmap->n; 2882ccdfe979SStefano Zampini n = B->cmap->n; 28830e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs)); 28840e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 2885ccdfe979SStefano Zampini break; 2886ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2887ccdfe979SStefano Zampini m = B->rmap->n; 2888ccdfe979SStefano Zampini n = B->rmap->n; 28890e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs)); 28900e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 2891ccdfe979SStefano Zampini break; 2892d71ae5a4SJacob Faibussowitsch default: 2893d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2894ccdfe979SStefano Zampini } 28959566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, m, n, m, n)); 2896ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 28979566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 28989566063dSJacob Faibussowitsch PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2899ccdfe979SStefano Zampini 2900ccdfe979SStefano Zampini /* product data */ 29019566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2902ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2903afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2904afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 290548a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2906afb2bd1cSJunchao Zhang #endif 2907ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2908ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 29099566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 29109566063dSJacob Faibussowitsch PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2911ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 29129566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2913ccdfe979SStefano Zampini } else { 29149566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2915ccdfe979SStefano Zampini } 2916ccdfe979SStefano Zampini } 2917ccdfe979SStefano Zampini C->product->data = mmdata; 2918ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2919ccdfe979SStefano Zampini 2920ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 29213ba16761SJacob Faibussowitsch 
PetscFunctionReturn(PETSC_SUCCESS); 2922ccdfe979SStefano Zampini } 2923ccdfe979SStefano Zampini 2924d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2925d71ae5a4SJacob Faibussowitsch { 2926ccdfe979SStefano Zampini Mat_Product *product = C->product; 2927fcdce8c4SStefano Zampini Mat A, B; 2928fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2929fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2930fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2931fcdce8c4SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 2932fcdce8c4SStefano Zampini PetscBool flg; 2933fcdce8c4SStefano Zampini cusparseStatus_t stat; 2934fcdce8c4SStefano Zampini MatProductType ptype; 2935fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2936fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2937fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2938fcdce8c4SStefano Zampini #endif 2939b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2940ccdfe979SStefano Zampini 2941ccdfe979SStefano Zampini PetscFunctionBegin; 2942ccdfe979SStefano Zampini MatCheckProduct(C, 1); 294328b400f6SJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 29449566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg)); 294528b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2946fcdce8c4SStefano Zampini mmdata = (MatMatCusparse *)C->product->data; 2947fcdce8c4SStefano Zampini A = product->A; 2948fcdce8c4SStefano Zampini B = product->B; 2949fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2950fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2951fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 295208401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2953fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 295428b400f6SJacob Faibussowitsch PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]); 2955fcdce8c4SStefano Zampini Ccsr = (CsrMatrix *)Cmat->mat; 295628b400f6SJacob Faibussowitsch PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2957fcdce8c4SStefano Zampini goto finalize; 2958fcdce8c4SStefano Zampini } 2959fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 29609566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 296128b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 29629566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 296328b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 296428b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix 
between MatProductSymbolic and MatProductNumeric phases"); 296528b400f6SJacob Faibussowitsch PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2966fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2967fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2968fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 296908401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 297008401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 297108401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 29729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 29739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2974fcdce8c4SStefano Zampini 2975fcdce8c4SStefano Zampini ptype = product->type; 2976b94d7dedSBarry Smith if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2977fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 297828b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric"); 2979fa046f9fSJunchao Zhang } 2980b94d7dedSBarry Smith if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2981fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 298228b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric"); 2983fa046f9fSJunchao Zhang } 2984fcdce8c4SStefano Zampini switch (ptype) { 2985fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2986fcdce8c4SStefano Zampini Amat = Acusp->mat; 2987fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2988fcdce8c4SStefano Zampini break; 2989fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2990fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2991fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2992fcdce8c4SStefano Zampini break; 2993fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2994fcdce8c4SStefano Zampini Amat = Acusp->mat; 2995fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2996fcdce8c4SStefano Zampini break; 2997d71ae5a4SJacob Faibussowitsch default: 2998d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2999fcdce8c4SStefano Zampini } 3000fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 300128b400f6SJacob Faibussowitsch PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 300228b400f6SJacob Faibussowitsch PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 300328b400f6SJacob Faibussowitsch PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]); 3004fcdce8c4SStefano Zampini Acsr = (CsrMatrix *)Amat->mat; 3005fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? 
mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */ 3006fcdce8c4SStefano Zampini Ccsr = (CsrMatrix *)Cmat->mat; 300728b400f6SJacob Faibussowitsch PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 300828b400f6SJacob Faibussowitsch PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 300928b400f6SJacob Faibussowitsch PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 30109566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3011fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3012fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 30139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3014b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 30159371c9d4SSatish Balay stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 30169371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3017b4285af6SJunchao Zhang #else 30189371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 30199371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30209371c9d4SSatish Balay stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 30219371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3022b4285af6SJunchao Zhang #endif 3023fcdce8c4SStefano Zampini #else 30249371c9d4SSatish Balay stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 30259371c9d4SSatish Balay Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 30269371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3027fcdce8c4SStefano Zampini #endif 30289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 30299566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 30309566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3031fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 3032fcdce8c4SStefano Zampini finalize: 3033fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 30349566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 30359566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 30369566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 3037fcdce8c4SStefano Zampini c->reallocs = 0; 3038fcdce8c4SStefano Zampini C->info.mallocs += 0; 3039fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 3040fcdce8c4SStefano 
Zampini C->assembled = C->was_assembled = PETSC_TRUE; 3041fcdce8c4SStefano Zampini C->num_ass++; 30423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3043ccdfe979SStefano Zampini } 3044fcdce8c4SStefano Zampini 3045d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 3046d71ae5a4SJacob Faibussowitsch { 3047fcdce8c4SStefano Zampini Mat_Product *product = C->product; 3048fcdce8c4SStefano Zampini Mat A, B; 3049fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 3050fcdce8c4SStefano Zampini Mat_SeqAIJ *a, *b, *c; 3051fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 3052fcdce8c4SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 3053fcdce8c4SStefano Zampini PetscInt i, j, m, n, k; 3054fcdce8c4SStefano Zampini PetscBool flg; 3055fcdce8c4SStefano Zampini cusparseStatus_t stat; 3056fcdce8c4SStefano Zampini MatProductType ptype; 3057fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 3058fcdce8c4SStefano Zampini PetscLogDouble flops; 3059fcdce8c4SStefano Zampini PetscBool biscompressed, ciscompressed; 3060fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3061fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 3062fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 3063fcdce8c4SStefano Zampini #else 3064fcdce8c4SStefano Zampini int cnz; 3065fcdce8c4SStefano Zampini #endif 3066b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3067fcdce8c4SStefano Zampini 3068fcdce8c4SStefano Zampini PetscFunctionBegin; 3069fcdce8c4SStefano Zampini MatCheckProduct(C, 1); 307028b400f6SJacob Faibussowitsch PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 3071fcdce8c4SStefano Zampini A = product->A; 3072fcdce8c4SStefano Zampini B = product->B; 30739566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 307428b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 30759566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 307628b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 3077fcdce8c4SStefano Zampini a = (Mat_SeqAIJ *)A->data; 3078fcdce8c4SStefano Zampini b = (Mat_SeqAIJ *)B->data; 3079fcdce8c4SStefano Zampini /* product data */ 30809566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 3081fcdce8c4SStefano Zampini C->product->data = mmdata; 3082fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 3083fcdce8c4SStefano Zampini 30849566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 30859566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3086d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3087d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 308808401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 308908401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 
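  /* cuSPARSE SpGEMM only supports non-transpose operands (see opA/opB above), so A^T*B (resp. A*B^T) is remapped to a plain A*B when A (resp. B) is symmetric; otherwise an explicit transpose is formed in the switch below. The symbolic_used_the_fact_*_is_symmetric flags recorded here are checked again in the numeric phase. */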
3090d60bce21SJunchao Zhang 3091fcdce8c4SStefano Zampini ptype = product->type; 3092b94d7dedSBarry Smith if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3093fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 3094fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3095fa046f9fSJunchao Zhang } 3096b94d7dedSBarry Smith if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3097fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 3098fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3099fa046f9fSJunchao Zhang } 3100fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 3101fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 3102fcdce8c4SStefano Zampini switch (ptype) { 3103fcdce8c4SStefano Zampini case MATPRODUCT_AB: 3104fcdce8c4SStefano Zampini m = A->rmap->n; 3105fcdce8c4SStefano Zampini n = B->cmap->n; 3106fcdce8c4SStefano Zampini k = A->cmap->n; 3107fcdce8c4SStefano Zampini Amat = Acusp->mat; 3108fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3109fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3110fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3111fcdce8c4SStefano Zampini break; 3112fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 3113fcdce8c4SStefano Zampini m = A->cmap->n; 3114fcdce8c4SStefano Zampini n = B->cmap->n; 3115fcdce8c4SStefano Zampini k = A->rmap->n; 31169566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3117fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 3118fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3119fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3120fcdce8c4SStefano Zampini break; 3121fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 3122fcdce8c4SStefano Zampini m = A->rmap->n; 3123fcdce8c4SStefano Zampini n = B->rmap->n; 3124fcdce8c4SStefano Zampini k = A->cmap->n; 31259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3126fcdce8c4SStefano Zampini Amat = Acusp->mat; 3127fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 3128fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3129fcdce8c4SStefano Zampini break; 3130d71ae5a4SJacob Faibussowitsch default: 3131d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 3132fcdce8c4SStefano Zampini } 3133fcdce8c4SStefano Zampini 3134fcdce8c4SStefano Zampini /* create cusparse matrix */ 31359566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, m, n, m, n)); 31369566063dSJacob Faibussowitsch PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 3137fcdce8c4SStefano Zampini c = (Mat_SeqAIJ *)C->data; 3138fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 3139fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3140fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 3141fcdce8c4SStefano Zampini 3142fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 3143fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */ 3144fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 31459566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 31469566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
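    /* a row of C = A*B (or A*B^T) can be nonempty only if the corresponding row of A is, so A's nonzero-row index map is reused for C as-is and mirrored on the GPU below in Cmat->cprowIndices */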
3147fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3148fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3149fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 3150fcdce8c4SStefano Zampini } else { 3151fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 3152fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 3153fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 3154fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 3155fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 3156fcdce8c4SStefano Zampini } 3157fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 3158fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 3159fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 3160fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 3161fcdce8c4SStefano Zampini Ccsr->num_cols = n; 3162fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 31639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 31649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 31659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3166f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 3167f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 3168f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 31699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 31709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 31719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3172fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 3173d460d7bfSJunchao Zhang PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3174fcdce8c4SStefano Zampini c->nz = 0; 3175fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3176fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 3177fcdce8c4SStefano Zampini goto finalizesym; 3178fcdce8c4SStefano Zampini } 3179fcdce8c4SStefano Zampini 318028b400f6SJacob Faibussowitsch PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 318128b400f6SJacob Faibussowitsch PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3182fcdce8c4SStefano Zampini Acsr = (CsrMatrix *)Amat->mat; 3183fcdce8c4SStefano Zampini if (!biscompressed) { 3184fcdce8c4SStefano Zampini Bcsr = (CsrMatrix *)Bmat->mat; 3185fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3186fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 3187fcdce8c4SStefano Zampini #endif 3188fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 3189fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3190fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 3191fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 3192fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 3193fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 3194fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 3195fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 3196fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 3197fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3198fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 31999566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3200fcdce8c4SStefano Zampini } 3201fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3202fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 3203fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3204fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 32059371c9d4SSatish Balay stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 32069371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3207fcdce8c4SStefano Zampini } 3208fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 3209fcdce8c4SStefano Zampini #endif 3210fcdce8c4SStefano Zampini } 321128b400f6SJacob Faibussowitsch PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 321228b400f6SJacob Faibussowitsch PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3213fcdce8c4SStefano Zampini /* precompute flops count */ 3214fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 3215fcdce8c4SStefano Zampini for (i = 0, flops = 0; i < A->rmap->n; i++) { 3216fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 3217fcdce8c4SStefano Zampini const PetscInt en = a->i[i + 1]; 3218fcdce8c4SStefano Zampini for (j = st; j < en; j++) { 3219fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 3220fcdce8c4SStefano Zampini flops += 2. 
* (b->i[brow + 1] - b->i[brow]); 3221fcdce8c4SStefano Zampini } 3222fcdce8c4SStefano Zampini } 3223fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 3224fcdce8c4SStefano Zampini for (i = 0, flops = 0; i < A->rmap->n; i++) { 3225fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i + 1] - a->i[i]; 3226fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3227fcdce8c4SStefano Zampini flops += (2. * anzi) * bnzi; 3228fcdce8c4SStefano Zampini } 3229fcdce8c4SStefano Zampini } else { /* TODO */ 3230fcdce8c4SStefano Zampini flops = 0.; 3231fcdce8c4SStefano Zampini } 3232fcdce8c4SStefano Zampini 3233fcdce8c4SStefano Zampini mmdata->flops = flops; 32349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3235b4285af6SJunchao Zhang 3236fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 32379566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 32381ffab3bdSJunchao Zhang // cuda-12.2 requires non-null csrRowOffsets 32391ffab3bdSJunchao Zhang stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 32409371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32419566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3242b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3243b4285af6SJunchao Zhang { 3244b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 3245b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3246b4285af6SJunchao Zhang */ 3247b4285af6SJunchao Zhang void *dBuffer1 = NULL; 3248b4285af6SJunchao Zhang void *dBuffer2 = NULL; 3249b4285af6SJunchao Zhang void *dBuffer3 = NULL; 3250b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3251b4285af6SJunchao Zhang size_t bufferSize1 = 0; 3252b4285af6SJunchao Zhang size_t bufferSize2 = 0; 3253b4285af6SJunchao Zhang size_t bufferSize3 = 0; 3254b4285af6SJunchao Zhang size_t bufferSize4 = 0; 3255b4285af6SJunchao Zhang size_t bufferSize5 = 0; 3256b4285af6SJunchao Zhang 3257b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 32589371c9d4SSatish Balay stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 32599371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3261b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 32629371c9d4SSatish Balay stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 32639371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3264b4285af6SJunchao Zhang 32659371c9d4SSatish Balay stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 32669371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void 
**)&dBuffer2, bufferSize2)); 32689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 32699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 32709371c9d4SSatish Balay stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 32719371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32729566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer1)); 32739566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer2)); 3274b4285af6SJunchao Zhang 3275b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 32769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3277b4285af6SJunchao Zhang c->nz = (PetscInt)C_nnz1; 3278b4285af6SJunchao Zhang /* allocate matrix C */ 32799371c9d4SSatish Balay Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 32809371c9d4SSatish Balay PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 32819371c9d4SSatish Balay Ccsr->values = new THRUSTARRAY(c->nz); 32829371c9d4SSatish Balay PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3283b4285af6SJunchao Zhang /* update matC with the new pointers */ 32849371c9d4SSatish Balay stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 32859371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3286b4285af6SJunchao Zhang 32879371c9d4SSatish Balay stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 32889371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 32909371c9d4SSatish Balay stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 32919371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32929566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer3)); 32939371c9d4SSatish Balay stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 32949371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32959566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3296b4285af6SJunchao Zhang } 3297ae37ee31SJunchao Zhang #else 3298b4285af6SJunchao Zhang size_t bufSize2; 3299fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 33009371c9d4SSatish Balay stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 33019371c9d4SSatish Balay PetscCallCUSPARSE(stat); 33029566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3303fcdce8c4SStefano Zampini 
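  /* When cusparseSpGEMMreuse is not available (CUDA 11.0-11.3), the generic SpGEMM API follows a query-then-execute pattern: cusparseSpGEMM_workEstimation and cusparseSpGEMM_compute are each called first with a NULL buffer to obtain the required size and then again with device memory of that size; the buffers are kept in mmdata because compute is re-invoked in the numeric phase. */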
/* inspect the matrices A and B to understand the memory requirement for the next step */ 33049371c9d4SSatish Balay stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 33059371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3306fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 33079371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 33089371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3309fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 3310fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 3311fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3312fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3313fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... */ 33149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3315fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 33169371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 33179371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3318fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 33199566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3320fcdce8c4SStefano Zampini c->nz = (PetscInt)C_nnz1; 33219371c9d4SSatish Balay PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 33229371c9d4SSatish Balay mmdata->mmBufferSize / 1024)); 3323fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 33249566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3325fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 33269566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 33279371c9d4SSatish Balay stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 33289371c9d4SSatish Balay PetscCallCUSPARSE(stat); 33299371c9d4SSatish Balay stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 33309371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3331ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3332fcdce8c4SStefano Zampini #else 33339566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 33349371c9d4SSatish Balay stat = cusparseXcsrgemmNnz(Ccusp->handle, 
opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 33359371c9d4SSatish Balay Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 33369371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3337fcdce8c4SStefano Zampini c->nz = cnz; 3338fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 33399566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3340fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 33419566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3342fcdce8c4SStefano Zampini 33439566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3344fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3345fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3346fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 33479371c9d4SSatish Balay stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 33489371c9d4SSatish Balay Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 33499371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3350fcdce8c4SStefano Zampini #endif 33519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 33529566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3353fcdce8c4SStefano Zampini finalizesym: 3354fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 33559f0612e4SBarry Smith PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 33569f0612e4SBarry Smith PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3357fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 33587de69702SBarry Smith if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3359fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3360fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3361fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3362fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 3363fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 3364fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 33659566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 33669566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3367fcdce8c4SStefano Zampini } else { 3368fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3369fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 33709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i, 
Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 33719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3372fcdce8c4SStefano Zampini } 3373fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 3374fcdce8c4SStefano Zampini PetscInt r = 0; 3375fcdce8c4SStefano Zampini c->i[0] = 0; 3376fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 3377fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 3378fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 3379fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r + 1] = old; 3380fcdce8c4SStefano Zampini } 3381fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3382fcdce8c4SStefano Zampini } 33839566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 33849566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 33859566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->imax)); 3386fcdce8c4SStefano Zampini c->maxnz = c->nz; 3387fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 3388fcdce8c4SStefano Zampini c->rmax = 0; 3389fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 3390fcdce8c4SStefano Zampini const PetscInt nn = c->i[k + 1] - c->i[k]; 3391fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 3392fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 3393fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 3394fcdce8c4SStefano Zampini } 33959566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(C)); 33969566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->a)); 3397fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 3398fcdce8c4SStefano Zampini 3399fcdce8c4SStefano Zampini C->nonzerostate++; 34009566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->rmap)); 34019566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->cmap)); 3402fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 3403fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3404fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 3405fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 3406fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 3407abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3408fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 3409fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 3410fcdce8c4SStefano Zampini } 3411fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 34123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3413fcdce8c4SStefano Zampini } 3414fcdce8c4SStefano Zampini 3415fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3416fcdce8c4SStefano Zampini 3417fcdce8c4SStefano Zampini /* handles sparse or dense B */ 3418d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3419d71ae5a4SJacob Faibussowitsch { 3420fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 3421fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE, Biscusp 
= PETSC_FALSE, Ciscusp = PETSC_TRUE; 3422fcdce8c4SStefano Zampini 3423fcdce8c4SStefano Zampini PetscFunctionBegin; 3424fcdce8c4SStefano Zampini MatCheckProduct(mat, 1); 34259566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 342648a46eb9SPierre Jolivet if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3427fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 3428fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 342948a46eb9SPierre Jolivet if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3430fcdce8c4SStefano Zampini } 343165e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 343265e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 343365e4b4d4SStefano Zampini switch (product->type) { 343465e4b4d4SStefano Zampini case MATPRODUCT_AB: 343565e4b4d4SStefano Zampini if (product->api_user) { 3436d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 34379566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3438d0609cedSBarry Smith PetscOptionsEnd(); 343965e4b4d4SStefano Zampini } else { 3440d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 34419566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3442d0609cedSBarry Smith PetscOptionsEnd(); 344365e4b4d4SStefano Zampini } 344465e4b4d4SStefano Zampini break; 344565e4b4d4SStefano Zampini case MATPRODUCT_AtB: 344665e4b4d4SStefano Zampini if (product->api_user) { 3447d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 34489566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3449d0609cedSBarry Smith PetscOptionsEnd(); 345065e4b4d4SStefano Zampini } else { 3451d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 34529566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3453d0609cedSBarry Smith PetscOptionsEnd(); 345465e4b4d4SStefano Zampini } 345565e4b4d4SStefano Zampini break; 345665e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 345765e4b4d4SStefano Zampini if (product->api_user) { 3458d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 34599566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3460d0609cedSBarry Smith PetscOptionsEnd(); 346165e4b4d4SStefano Zampini } else { 3462d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 34639566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3464d0609cedSBarry Smith PetscOptionsEnd(); 346565e4b4d4SStefano Zampini } 346665e4b4d4SStefano Zampini 
break; 346765e4b4d4SStefano Zampini case MATPRODUCT_RARt: 346865e4b4d4SStefano Zampini if (product->api_user) { 3469d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 34709566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3471d0609cedSBarry Smith PetscOptionsEnd(); 347265e4b4d4SStefano Zampini } else { 3473d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 34749566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3475d0609cedSBarry Smith PetscOptionsEnd(); 347665e4b4d4SStefano Zampini } 347765e4b4d4SStefano Zampini break; 347865e4b4d4SStefano Zampini case MATPRODUCT_ABC: 347965e4b4d4SStefano Zampini if (product->api_user) { 3480d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 34819566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3482d0609cedSBarry Smith PetscOptionsEnd(); 348365e4b4d4SStefano Zampini } else { 3484d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 34859566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3486d0609cedSBarry Smith PetscOptionsEnd(); 348765e4b4d4SStefano Zampini } 348865e4b4d4SStefano Zampini break; 3489d71ae5a4SJacob Faibussowitsch default: 3490d71ae5a4SJacob Faibussowitsch break; 349165e4b4d4SStefano Zampini } 349265e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 349365e4b4d4SStefano Zampini } 349465e4b4d4SStefano Zampini /* dispatch */ 3495fcdce8c4SStefano Zampini if (isdense) { 3496ccdfe979SStefano Zampini switch (product->type) { 3497ccdfe979SStefano Zampini case MATPRODUCT_AB: 3498ccdfe979SStefano Zampini case MATPRODUCT_AtB: 3499ccdfe979SStefano Zampini case MATPRODUCT_ABt: 3500ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 3501ccdfe979SStefano Zampini case MATPRODUCT_RARt: 3502fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 35039566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3504fcdce8c4SStefano Zampini } else { 3505fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3506fcdce8c4SStefano Zampini } 3507fcdce8c4SStefano Zampini break; 3508d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 3509d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3510d71ae5a4SJacob Faibussowitsch break; 3511d71ae5a4SJacob Faibussowitsch default: 3512d71ae5a4SJacob Faibussowitsch break; 3513ccdfe979SStefano Zampini } 3514fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 3515fcdce8c4SStefano Zampini switch (product->type) { 3516fcdce8c4SStefano Zampini case MATPRODUCT_AB: 3517fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 3518d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 3519d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3520d71ae5a4SJacob Faibussowitsch break; 3521fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 3522fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 3523d71ae5a4SJacob Faibussowitsch 
case MATPRODUCT_ABC: 3524d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3525d71ae5a4SJacob Faibussowitsch break; 3526d71ae5a4SJacob Faibussowitsch default: 3527d71ae5a4SJacob Faibussowitsch break; 3528fcdce8c4SStefano Zampini } 3529fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 35309566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3531fcdce8c4SStefano Zampini } 35323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3533ccdfe979SStefano Zampini } 3534ccdfe979SStefano Zampini 3535d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3536d71ae5a4SJacob Faibussowitsch { 35379ae82921SPaul Mullowney PetscFunctionBegin; 35389566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 35393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3540e6e9a74fSStefano Zampini } 3541e6e9a74fSStefano Zampini 3542d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3543d71ae5a4SJacob Faibussowitsch { 3544e6e9a74fSStefano Zampini PetscFunctionBegin; 35459566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 35463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3547e6e9a74fSStefano Zampini } 3548e6e9a74fSStefano Zampini 3549d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3550d71ae5a4SJacob Faibussowitsch { 3551e6e9a74fSStefano Zampini PetscFunctionBegin; 35529566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 35533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3554e6e9a74fSStefano Zampini } 3555e6e9a74fSStefano Zampini 3556d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3557d71ae5a4SJacob Faibussowitsch { 3558e6e9a74fSStefano Zampini PetscFunctionBegin; 35599566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 35603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 35619ae82921SPaul Mullowney } 35629ae82921SPaul Mullowney 3563d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3564d71ae5a4SJacob Faibussowitsch { 3565ca45077fSPaul Mullowney PetscFunctionBegin; 35669566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 35673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3568ca45077fSPaul Mullowney } 3569ca45077fSPaul Mullowney 3570d71ae5a4SJacob Faibussowitsch __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3571d71ae5a4SJacob Faibussowitsch { 3572a0e72f99SJunchao Zhang int i = blockIdx.x * blockDim.x + threadIdx.x; 3573a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 3574a0e72f99SJunchao Zhang } 3575a0e72f99SJunchao Zhang 3576afb2bd1cSJunchao Zhang /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3577d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3578d71ae5a4SJacob Faibussowitsch { 35799ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3580aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 35819ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3582e6e9a74fSStefano Zampini PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3583e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3584e6e9a74fSStefano Zampini PetscBool compressed; 3585afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3586afb2bd1cSJunchao Zhang PetscInt nx, ny; 3587afb2bd1cSJunchao Zhang #endif 35886e111a19SKarl Rupp 35899ae82921SPaul Mullowney PetscFunctionBegin; 359008401ef6SPierre Jolivet PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3591cbc6b225SStefano Zampini if (!a->nz) { 3592995bce04SJacob Faibussowitsch if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3593995bce04SJacob Faibussowitsch else PetscCall(VecSeq_CUDA::Set(zz, 0)); 35943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3595e6e9a74fSStefano Zampini } 359634d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 35979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3598e6e9a74fSStefano Zampini if (!trans) { 35999ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 36005f80ce2aSJacob Faibussowitsch PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3601e6e9a74fSStefano Zampini } else { 36021a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3603e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3604e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3605e6e9a74fSStefano Zampini } else { 36069566063dSJacob Faibussowitsch if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3607e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3608e6e9a74fSStefano Zampini } 3609e6e9a74fSStefano Zampini } 3610e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3611e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3612213423ffSJunchao Zhang 3613e6e9a74fSStefano Zampini try { 36149566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 361569d47153SPierre Jolivet if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 36169566063dSJacob Faibussowitsch else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3617afb2bd1cSJunchao Zhang 36189566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3619e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3620afb2bd1cSJunchao Zhang /* z = A x + beta y. 3621afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 
3622afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3623afb2bd1cSJunchao Zhang */ 3624e6e9a74fSStefano Zampini xptr = xarray; 3625afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3626213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3627afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3628afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3629afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3630afb2bd1cSJunchao Zhang */ 3631afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3632afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3633fe5544b9SJunchao Zhang nx = mat->num_cols; // since y = Ax 3634afb2bd1cSJunchao Zhang ny = mat->num_rows; 3635afb2bd1cSJunchao Zhang } 3636afb2bd1cSJunchao Zhang #endif 3637e6e9a74fSStefano Zampini } else { 3638afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3639afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3640afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3641afb2bd1cSJunchao Zhang */ 3642afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3643e6e9a74fSStefano Zampini dptr = zarray; 3644e6e9a74fSStefano Zampini beta = yy ? matstruct->beta_one : matstruct->beta_zero; 3645afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3646e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3647d0967f54SJacob Faibussowitsch 3648d0967f54SJacob Faibussowitsch thrust::for_each( 3649d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC) 3650d0967f54SJacob Faibussowitsch thrust::cuda::par.on(PetscDefaultCudaStream), 3651d0967f54SJacob Faibussowitsch #endif 3652d0967f54SJacob Faibussowitsch thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 36539371c9d4SSatish Balay thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3654e6e9a74fSStefano Zampini } 3655afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3656afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3657afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3658fe5544b9SJunchao Zhang nx = mat->num_rows; // since y = A^T x 3659afb2bd1cSJunchao Zhang ny = mat->num_cols; 3660afb2bd1cSJunchao Zhang } 3661afb2bd1cSJunchao Zhang #endif 3662e6e9a74fSStefano Zampini } 36639ae82921SPaul Mullowney 3664afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3665aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3666afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3667fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3668fe5544b9SJunchao Zhang cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA. 
3669fe5544b9SJunchao Zhang #else 3670fe5544b9SJunchao Zhang cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3671fe5544b9SJunchao Zhang #endif 3672fe5544b9SJunchao Zhang 36735f80ce2aSJacob Faibussowitsch PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3674fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3675fe5544b9SJunchao Zhang if (!matDescr) { 3676fe5544b9SJunchao Zhang CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3677fe5544b9SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3678fe5544b9SJunchao Zhang } 3679fe5544b9SJunchao Zhang #endif 3680fe5544b9SJunchao Zhang 3681afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 36829566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 36839566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 36849371c9d4SSatish Balay PetscCallCUSPARSE( 3685fe5544b9SJunchao Zhang cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 36869566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3687fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3688fe5544b9SJunchao Zhang PetscCallCUSPARSE( 3689fe5544b9SJunchao Zhang cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3690fe5544b9SJunchao Zhang #endif 3691afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3692afb2bd1cSJunchao Zhang } else { 3693afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 36949566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 36959566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3696afb2bd1cSJunchao Zhang } 3697afb2bd1cSJunchao Zhang 3698fe5544b9SJunchao Zhang PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3699afb2bd1cSJunchao Zhang #else 37007656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 37019371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3702afb2bd1cSJunchao Zhang #endif 3703aa372e3fSPaul Mullowney } else { 3704213423ffSJunchao Zhang if 
(cusparsestruct->nrows) { 3705afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3706afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3707afb2bd1cSJunchao Zhang #else 3708301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 37099371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3710afb2bd1cSJunchao Zhang #endif 3711a65300a6SPaul Mullowney } 3712aa372e3fSPaul Mullowney } 37139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3714aa372e3fSPaul Mullowney 3715e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3716213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3717213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3718995bce04SJacob Faibussowitsch PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3719e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3720995bce04SJacob Faibussowitsch PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 37217656d835SStefano Zampini } 3722213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 3723995bce04SJacob Faibussowitsch PetscCall(VecSeq_CUDA::Set(zz, 0)); 37247656d835SStefano Zampini } 37257656d835SStefano Zampini 3726213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3727213423ffSJunchao Zhang if (compressed) { 37289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3729da81f932SPierre Jolivet /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered) 3730a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3731a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 
3732a0e72f99SJunchao Zhang */ 3733a0e72f99SJunchao Zhang #if 0 3734a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3735a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3736a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3737e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3738c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3739a0e72f99SJunchao Zhang #else 37406497c311SBarry Smith PetscInt n = (PetscInt)matstruct->cprowIndices->size(); 37416497c311SBarry Smith ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3742a0e72f99SJunchao Zhang #endif 37439566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3744e6e9a74fSStefano Zampini } 3745e6e9a74fSStefano Zampini } else { 3746995bce04SJacob Faibussowitsch if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3747e6e9a74fSStefano Zampini } 37489566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 37499566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 37509566063dSJacob Faibussowitsch else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 3751d71ae5a4SJacob Faibussowitsch } catch (char *ex) { 3752d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 3753d71ae5a4SJacob Faibussowitsch } 3754e6e9a74fSStefano Zampini if (yy) { 37559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3756e6e9a74fSStefano Zampini } else { 37579566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 3758e6e9a74fSStefano Zampini } 37593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 37609ae82921SPaul Mullowney } 37619ae82921SPaul Mullowney 3762d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3763d71ae5a4SJacob Faibussowitsch { 3764ca45077fSPaul Mullowney PetscFunctionBegin; 37659566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 37663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3767ca45077fSPaul Mullowney } 3768ca45077fSPaul Mullowney 3769d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) 3770d71ae5a4SJacob Faibussowitsch { 3771042217e8SBarry Smith PetscFunctionBegin; 37729566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 37733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 37749ae82921SPaul Mullowney } 37759ae82921SPaul Mullowney 3776e057df02SPaul Mullowney /*@ 377711a5261eSBarry Smith MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 37782920cce0SJacob Faibussowitsch (the default parallel PETSc format). 37799ae82921SPaul Mullowney 3780d083f849SBarry Smith Collective 37819ae82921SPaul Mullowney 37829ae82921SPaul Mullowney Input Parameters: 378311a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 37849ae82921SPaul Mullowney . 
m - number of rows 37859ae82921SPaul Mullowney . n - number of columns 378620f4b53cSBarry Smith . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 378720f4b53cSBarry Smith - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 37889ae82921SPaul Mullowney 37899ae82921SPaul Mullowney Output Parameter: 37909ae82921SPaul Mullowney . A - the matrix 37919ae82921SPaul Mullowney 37922ef1f0ffSBarry Smith Level: intermediate 37932ef1f0ffSBarry Smith 37942ef1f0ffSBarry Smith Notes: 37952920cce0SJacob Faibussowitsch This matrix will ultimately be pushed down to NVIDIA GPUs and will use the CuSPARSE library for 37962920cce0SJacob Faibussowitsch calculations. For good matrix assembly performance the user should preallocate the matrix 37972920cce0SJacob Faibussowitsch storage by setting the parameter `nz` (or the array `nnz`). 37982920cce0SJacob Faibussowitsch 379911a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 38009ae82921SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 380111a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 38029ae82921SPaul Mullowney 380311a5261eSBarry Smith The AIJ format, also called 38042ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 38059ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 380620f4b53cSBarry Smith either one (as in Fortran) or zero. 38079ae82921SPaul Mullowney 38089ae82921SPaul Mullowney Specify the preallocated storage with either `nz` or `nnz` (not both). 38092ef1f0ffSBarry Smith Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 381020f4b53cSBarry Smith allocation.
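   Example Usage:
   A minimal sketch; the matrix size and the estimate of 5 nonzeros per row below are illustrative only:
.vb
     Mat A;
     PetscCall(MatCreateSeqAIJCUSPARSE(PETSC_COMM_SELF, 100, 100, 5, NULL, &A));
     /* insert entries with MatSetValues(), then assemble as usual */
     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
     PetscCall(MatDestroy(&A));
.ve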
38119ae82921SPaul Mullowney 3812fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE` 38139ae82921SPaul Mullowney @*/ 3814d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 3815d71ae5a4SJacob Faibussowitsch { 38169ae82921SPaul Mullowney PetscFunctionBegin; 38179566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 38189566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 38199566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 38209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 38213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 38229ae82921SPaul Mullowney } 38239ae82921SPaul Mullowney 3824d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 3825d71ae5a4SJacob Faibussowitsch { 38269ae82921SPaul Mullowney PetscFunctionBegin; 38279ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 38282c4ab24aSJunchao Zhang PetscCall(MatSeqAIJCUSPARSE_Destroy(A)); 38299ae82921SPaul Mullowney } else { 38309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr)); 3831aa372e3fSPaul Mullowney } 38329566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 38339566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 38349566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL)); 38359566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 38369566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 38379566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 38389566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 38399566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 38409566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 38419566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL)); 38429566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 38433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 38449ae82921SPaul Mullowney } 38459ae82921SPaul Mullowney 3846ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 384795639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 3848d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) 3849d71ae5a4SJacob Faibussowitsch { 38509ff858a8SKarl Rupp PetscFunctionBegin; 38519566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 38529566063dSJacob Faibussowitsch 
PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 38533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 38549ff858a8SKarl Rupp } 38559ff858a8SKarl Rupp 3856d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3857d71ae5a4SJacob Faibussowitsch { 3858a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3859039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3860039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3861039c6fbaSStefano Zampini PetscScalar *ay; 3862039c6fbaSStefano Zampini const PetscScalar *ax; 3863039c6fbaSStefano Zampini CsrMatrix *csry, *csrx; 3864e6e9a74fSStefano Zampini 386595639643SRichard Tran Mills PetscFunctionBegin; 3866a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3867a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3868039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 38699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 38709566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 38713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 387295639643SRichard Tran Mills } 3873039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 38749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 38759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 38765f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 38775f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3878039c6fbaSStefano Zampini csry = (CsrMatrix *)cy->mat->mat; 3879039c6fbaSStefano Zampini csrx = (CsrMatrix *)cx->mat->mat; 3880039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3881039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3882039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3883ad540459SPierre Jolivet if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3884039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3885039c6fbaSStefano Zampini } 3886d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3887d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3888039c6fbaSStefano Zampini 3889039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3890039c6fbaSStefano Zampini PetscScalar b = 1.0; 3891039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3892039c6fbaSStefano Zampini size_t bufferSize; 3893039c6fbaSStefano Zampini void *buffer; 3894039c6fbaSStefano Zampini #endif 3895039c6fbaSStefano Zampini 38969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 38979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 38989566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3899039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 39009371c9d4SSatish Balay 
PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 39019371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 39029566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 39039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 39049371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 39059371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 39069566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 39079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 39089566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 3909039c6fbaSStefano Zampini #else 39109566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 39119371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 39129371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 39139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 39149566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3915039c6fbaSStefano Zampini #endif 39169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 39179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 39189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 39199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3920039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3921a587d139SMark cublasHandle_t cublasv2handle; 3922a587d139SMark PetscBLASInt one = 1, bnz = 1; 3923039c6fbaSStefano Zampini 39249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 39259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 39269566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 39279566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz, &bnz)); 39289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 39299566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 39309566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * bnz)); 39319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 39329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 39339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 39349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3935039c6fbaSStefano Zampini } else { 39369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 39379566063dSJacob Faibussowitsch 
PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3938a587d139SMark } 39393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 394095639643SRichard Tran Mills } 394195639643SRichard Tran Mills 3942d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3943d71ae5a4SJacob Faibussowitsch { 394433c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 394533c9ba73SStefano Zampini PetscScalar *ay; 394633c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 394733c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 394833c9ba73SStefano Zampini 394933c9ba73SStefano Zampini PetscFunctionBegin; 39509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 39519566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 39529566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz, &bnz)); 39539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 39549566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 39559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(bnz)); 39569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 39579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 39589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 39593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 396033c9ba73SStefano Zampini } 396133c9ba73SStefano Zampini 3962d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3963d71ae5a4SJacob Faibussowitsch { 39647e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3965a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 39667e8381f9SStefano Zampini 39673fa6b06aSMark Adams PetscFunctionBegin; 39683fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 39693fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 39707e8381f9SStefano Zampini if (spptr->mat) { 39717e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 39727e8381f9SStefano Zampini if (matrix->values) { 39737e8381f9SStefano Zampini both = PETSC_TRUE; 39747e8381f9SStefano Zampini thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 39757e8381f9SStefano Zampini } 39767e8381f9SStefano Zampini } 39777e8381f9SStefano Zampini if (spptr->matTranspose) { 39787e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3979ad540459SPierre Jolivet if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 39807e8381f9SStefano Zampini } 39813fa6b06aSMark Adams } 39829566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 39839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 39847e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3985a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 39863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 39873fa6b06aSMark Adams } 39883fa6b06aSMark Adams 3989d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 3990d71ae5a4SJacob Faibussowitsch { 3991a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3992a587d139SMark 3993a587d139SMark PetscFunctionBegin; 39949a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 39959a14fc28SStefano Zampini A->boundtocpu = flg; 39963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
39979a14fc28SStefano Zampini } 3998a587d139SMark if (flg) { 39999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 4000a587d139SMark 400133c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 4002a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 4003a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 4004a587d139SMark A->ops->mult = MatMult_SeqAIJ; 4005a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 4006a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 4007a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 4008a587d139SMark A->ops->multhermitiantranspose = NULL; 4009a587d139SMark A->ops->multhermitiantransposeadd = NULL; 4010fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 40119566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 40129566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 40139566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 40149566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 40159566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 40169566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 40179566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 4018a587d139SMark } else { 401933c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 4020a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 4021a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 4022a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 4023a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 4024a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 4025a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 4026a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 4027a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 4028fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 402967a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 403067a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 403167a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 403267a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 403367a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 403467a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 40357ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 40367ee59b9bSJunchao Zhang 40379566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 40389566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 40399566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 40409566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 40419566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 40429566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4043a587d139SMark } 4044a587d139SMark A->boundtocpu = flg; 4045ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 4046ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 4047ea500dcfSRichard Tran Mills } else { 4048ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 4049ea500dcfSRichard Tran Mills } 40503ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4051a587d139SMark } 4052a587d139SMark 40538eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 4054d71ae5a4SJacob Faibussowitsch { 405549735bf3SStefano Zampini Mat B; 40569ae82921SPaul Mullowney 40579ae82921SPaul Mullowney PetscFunctionBegin; 40589566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 405949735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 40609566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 406149735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 40629566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 406349735bf3SStefano Zampini } 406449735bf3SStefano Zampini B = *newmat; 406549735bf3SStefano Zampini 40669566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 40679566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 406834136279SStefano Zampini 406949735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 40709ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 4071e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 40729566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 40739566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 40749566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 40751a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 4076d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4077b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4078a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4079a435da06SStefano Zampini #else 4080d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4081a435da06SStefano Zampini #endif 4082d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4083d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4084d8132acaSStefano Zampini #endif 40851a2c6b5cSJunchao Zhang B->spptr = spptr; 40869ae82921SPaul Mullowney } else { 4087e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 4088e6e9a74fSStefano Zampini 40899566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 40909566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 40919566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4092e6e9a74fSStefano Zampini B->spptr = spptr; 40939ae82921SPaul Mullowney } 4094e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 409549735bf3SStefano Zampini } 4096693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 40979ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 40981a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 40999ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 410095639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4101693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 41022205254eSKarl Rupp 41039566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 41049566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 41059566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4106ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 41079566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 4108ae48a8d0SStefano Zampini #endif 41099566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 41103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41119ae82921SPaul Mullowney } 41129ae82921SPaul Mullowney 4113d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4114d71ae5a4SJacob Faibussowitsch { 411502fe1965SBarry Smith PetscFunctionBegin; 41169566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(B)); 41179566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 41183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 411902fe1965SBarry Smith } 412002fe1965SBarry Smith 41213ca39a21SBarry Smith /*MC 4122e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 4123e057df02SPaul Mullowney 412415229ffcSPierre Jolivet A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either 412511a5261eSBarry Smith CSR, ELL, or Hybrid format. 412611a5261eSBarry Smith All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 4127e057df02SPaul Mullowney 4128e057df02SPaul Mullowney Options Database Keys: 412911a5261eSBarry Smith + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 41302ef1f0ffSBarry Smith . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`). 41312ef1f0ffSBarry Smith Other options include ell (ellpack) or hyb (hybrid). 41322ef1f0ffSBarry Smith . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid). 
41332ef1f0ffSBarry Smith - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU 4134e057df02SPaul Mullowney 4135e057df02SPaul Mullowney Level: beginner 4136e057df02SPaul Mullowney 41371cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4138e057df02SPaul Mullowney M*/ 41397f756511SDominic Meiser 4140d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 4141d71ae5a4SJacob Faibussowitsch { 414242c9c57cSBarry Smith PetscFunctionBegin; 41439566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 41449566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 41459566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 41469566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 41473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 414842c9c57cSBarry Smith } 414929b38603SBarry Smith 41502c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat) 4151d71ae5a4SJacob Faibussowitsch { 41522c4ab24aSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 4153cbc6b225SStefano Zampini 4154cbc6b225SStefano Zampini PetscFunctionBegin; 41552c4ab24aSJunchao Zhang if (cusp) { 41562c4ab24aSJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format)); 41572c4ab24aSJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 41582c4ab24aSJunchao Zhang delete cusp->workVector; 41592c4ab24aSJunchao Zhang delete cusp->rowoffsets_gpu; 41602c4ab24aSJunchao Zhang delete cusp->csr2csc_i; 41612c4ab24aSJunchao Zhang delete cusp->coords; 41622c4ab24aSJunchao Zhang if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle)); 41632c4ab24aSJunchao Zhang PetscCall(PetscFree(mat->spptr)); 41647f756511SDominic Meiser } 41653ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41667f756511SDominic Meiser } 41677f756511SDominic Meiser 4168d71ae5a4SJacob Faibussowitsch static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 4169d71ae5a4SJacob Faibussowitsch { 41707f756511SDominic Meiser PetscFunctionBegin; 41717f756511SDominic Meiser if (*mat) { 41727f756511SDominic Meiser delete (*mat)->values; 41737f756511SDominic Meiser delete (*mat)->column_indices; 41747f756511SDominic Meiser delete (*mat)->row_offsets; 41757f756511SDominic Meiser delete *mat; 41767f756511SDominic Meiser *mat = 0; 41777f756511SDominic Meiser } 41783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41797f756511SDominic Meiser } 41807f756511SDominic Meiser 4181b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4182d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 4183d71ae5a4SJacob Faibussowitsch { 41847f756511SDominic Meiser PetscFunctionBegin; 41857f756511SDominic Meiser if (*trifactor) { 41869566063dSJacob Faibussowitsch if ((*trifactor)->descr) 
PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 4187261a78b4SJunchao Zhang if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 41889566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 41899566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 41909566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4191afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 41929566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4193afb2bd1cSJunchao Zhang #endif 41949566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactor)); 41957f756511SDominic Meiser } 41963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41977f756511SDominic Meiser } 4198d460d7bfSJunchao Zhang #endif 41997f756511SDominic Meiser 4200d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) 4201d71ae5a4SJacob Faibussowitsch { 42027f756511SDominic Meiser CsrMatrix *mat; 42037f756511SDominic Meiser 42047f756511SDominic Meiser PetscFunctionBegin; 42057f756511SDominic Meiser if (*matstruct) { 42067f756511SDominic Meiser if ((*matstruct)->mat) { 42077f756511SDominic Meiser if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 4208afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4209afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4210afb2bd1cSJunchao Zhang #else 42117f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 42129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4213afb2bd1cSJunchao Zhang #endif 42147f756511SDominic Meiser } else { 42157f756511SDominic Meiser mat = (CsrMatrix *)(*matstruct)->mat; 42163ba16761SJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&mat)); 42177f756511SDominic Meiser } 42187f756511SDominic Meiser } 42199566063dSJacob Faibussowitsch if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 42207f756511SDominic Meiser delete (*matstruct)->cprowIndices; 42219566063dSJacob Faibussowitsch if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 42229566063dSJacob Faibussowitsch if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 42239566063dSJacob Faibussowitsch if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4224afb2bd1cSJunchao Zhang 4225afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4226afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 42279566063dSJacob Faibussowitsch if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4228fe5544b9SJunchao Zhang 4229afb2bd1cSJunchao Zhang for (int i = 0; i < 3; i++) { 4230afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 42319566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 42329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 42339566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4234fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 4235fe5544b9SJunchao Zhang if (mdata->matDescr_SpMV[i]) 
PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i])); 4236fe5544b9SJunchao Zhang if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i])); 4237fe5544b9SJunchao Zhang #endif 4238afb2bd1cSJunchao Zhang } 4239afb2bd1cSJunchao Zhang } 4240afb2bd1cSJunchao Zhang #endif 42417f756511SDominic Meiser delete *matstruct; 42427e8381f9SStefano Zampini *matstruct = NULL; 42437f756511SDominic Meiser } 42443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 42457f756511SDominic Meiser } 42467f756511SDominic Meiser 4247d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) 4248d71ae5a4SJacob Faibussowitsch { 4249da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4250da112707SJunchao Zhang 42517f756511SDominic Meiser PetscFunctionBegin; 4252da112707SJunchao Zhang if (fs) { 4253b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4254da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4255da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4256da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4257da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4258d460d7bfSJunchao Zhang delete fs->workVector; 4259d460d7bfSJunchao Zhang fs->workVector = NULL; 4260d460d7bfSJunchao Zhang #endif 4261da112707SJunchao Zhang delete fs->rpermIndices; 4262da112707SJunchao Zhang delete fs->cpermIndices; 4263da112707SJunchao Zhang fs->rpermIndices = NULL; 4264da112707SJunchao Zhang fs->cpermIndices = NULL; 4265da112707SJunchao Zhang fs->init_dev_prop = PETSC_FALSE; 4266b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4267da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4268da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx)); 426930807b38SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr32)); 427030807b38SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx32)); 4271da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrVal)); 4272d460d7bfSJunchao Zhang PetscCallCUDA(cudaFree(fs->diag)); 4273da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->X)); 4274da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->Y)); 427512ba2bc6SJunchao Zhang // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 4276da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4277da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 427812ba2bc6SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4279da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4280da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4281da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4282da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4283da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4284da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4285da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4286da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4287da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4288da112707SJunchao Zhang 
PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4289da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4290da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4291d460d7bfSJunchao Zhang PetscCall(PetscFree(fs->csrRowPtr_h)); 4292d460d7bfSJunchao Zhang PetscCall(PetscFree(fs->csrVal_h)); 4293d460d7bfSJunchao Zhang PetscCall(PetscFree(fs->diag_h)); 429412ba2bc6SJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_FALSE; 429512ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4296da112707SJunchao Zhang #endif 4297ccdfe979SStefano Zampini } 42983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4299ccdfe979SStefano Zampini } 4300ccdfe979SStefano Zampini 4301d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) 4302d71ae5a4SJacob Faibussowitsch { 4303ccdfe979SStefano Zampini PetscFunctionBegin; 4304ccdfe979SStefano Zampini if (*trifactors) { 43059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4306f0173cd6SStefano Zampini PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle)); 43079566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 43087f756511SDominic Meiser } 43093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 43107f756511SDominic Meiser } 43117e8381f9SStefano Zampini 43129371c9d4SSatish Balay struct IJCompare { 4313d71ae5a4SJacob Faibussowitsch __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4314d71ae5a4SJacob Faibussowitsch { 43150b156cc8SJunchao Zhang if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; 43160b156cc8SJunchao Zhang if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 43177e8381f9SStefano Zampini return false; 43187e8381f9SStefano Zampini } 43197e8381f9SStefano Zampini }; 43207e8381f9SStefano Zampini 432166976f2fSJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4322d71ae5a4SJacob Faibussowitsch { 4323a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4324a49f1ed0SStefano Zampini 4325a49f1ed0SStefano Zampini PetscFunctionBegin; 4326a49f1ed0SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 43273ba16761SJacob Faibussowitsch if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 4328a49f1ed0SStefano Zampini if (destroy) { 43299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4330a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 4331a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 4332a49f1ed0SStefano Zampini } 43331a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 43343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4335a49f1ed0SStefano Zampini } 4336a49f1ed0SStefano Zampini 433749abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data) 4338d71ae5a4SJacob Faibussowitsch { 433949abdd8aSBarry Smith MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data; 43404d86920dSPierre Jolivet 43417e8381f9SStefano Zampini PetscFunctionBegin; 43422c4ab24aSJunchao Zhang PetscCallCUDA(cudaFree(coo->perm)); 43432c4ab24aSJunchao Zhang PetscCallCUDA(cudaFree(coo->jmap)); 43442c4ab24aSJunchao Zhang PetscCall(PetscFree(coo)); 43453ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
43467e8381f9SStefano Zampini } 4347ed502f03SStefano Zampini 434866976f2fSJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4349d71ae5a4SJacob Faibussowitsch { 43502c4ab24aSJunchao Zhang PetscBool dev_ij = PETSC_FALSE; 43512c4ab24aSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_HOST; 43522c4ab24aSJunchao Zhang PetscInt *i, *j; 435303e76207SPierre Jolivet PetscContainer container_h; 43542c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJ *coo_h, *coo_d; 4355219fbbafSJunchao Zhang 4356219fbbafSJunchao Zhang PetscFunctionBegin; 43579566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i, &mtype)); 43582c4ab24aSJunchao Zhang if (PetscMemTypeDevice(mtype)) { 43592c4ab24aSJunchao Zhang dev_ij = PETSC_TRUE; 43602c4ab24aSJunchao Zhang PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 43612c4ab24aSJunchao Zhang PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 43622c4ab24aSJunchao Zhang PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 43632c4ab24aSJunchao Zhang } else { 43642c4ab24aSJunchao Zhang i = coo_i; 43652c4ab24aSJunchao Zhang j = coo_j; 4366219fbbafSJunchao Zhang } 4367219fbbafSJunchao Zhang 43682c4ab24aSJunchao Zhang PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 43692c4ab24aSJunchao Zhang if (dev_ij) PetscCall(PetscFree2(i, j)); 4370cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 43712c4ab24aSJunchao Zhang // Create the GPU memory 43729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 43732c4ab24aSJunchao Zhang 43742c4ab24aSJunchao Zhang // Copy the COO struct to device 43752c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 43762c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 43772c4ab24aSJunchao Zhang PetscCall(PetscMalloc1(1, &coo_d)); 43782c4ab24aSJunchao Zhang *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different 43792c4ab24aSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount))); 43802c4ab24aSJunchao Zhang PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 43812c4ab24aSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount))); 43822c4ab24aSJunchao Zhang PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 43832c4ab24aSJunchao Zhang 43842c4ab24aSJunchao Zhang // Put the COO struct in a container and then attach that to the matrix 438503e76207SPierre Jolivet PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE)); 43863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4387219fbbafSJunchao Zhang } 4388219fbbafSJunchao Zhang 4389d71ae5a4SJacob Faibussowitsch __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) 4390d71ae5a4SJacob Faibussowitsch { 4391219fbbafSJunchao Zhang PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4392219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4393b6c38306SJunchao Zhang for (; i < nnz; i += grid_size) { 4394b6c38306SJunchao Zhang PetscScalar sum = 0.0; 
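    /* jmap[i] .. jmap[i+1]-1 select, via perm[], every user-provided COO value that maps to the i-th stored nonzero; accumulate them */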
4395b6c38306SJunchao Zhang for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4396b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum; 4397b6c38306SJunchao Zhang } 4398219fbbafSJunchao Zhang } 4399219fbbafSJunchao Zhang 440066976f2fSJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4401d71ae5a4SJacob Faibussowitsch { 4402219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4403219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4404219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4405219fbbafSJunchao Zhang PetscMemType memtype; 4406219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4407219fbbafSJunchao Zhang PetscScalar *Aa; 44082c4ab24aSJunchao Zhang PetscContainer container; 44092c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJ *coo; 4410219fbbafSJunchao Zhang 4411219fbbafSJunchao Zhang PetscFunctionBegin; 44122c4ab24aSJunchao Zhang if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 44132c4ab24aSJunchao Zhang 44142c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 44152c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 44162c4ab24aSJunchao Zhang 44179566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 4418219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 44192c4ab24aSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar))); 44202c4ab24aSJunchao Zhang PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4421219fbbafSJunchao Zhang } 4422219fbbafSJunchao Zhang 44239566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 44249566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4425219fbbafSJunchao Zhang 442608bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 4427cbc6b225SStefano Zampini if (Annz) { 44286497c311SBarry Smith MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa); 44299566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 4430cbc6b225SStefano Zampini } 443108bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 4432219fbbafSJunchao Zhang 44339566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 44349566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4435219fbbafSJunchao Zhang 44369566063dSJacob Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 44373ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4438219fbbafSJunchao Zhang } 4439219fbbafSJunchao Zhang 44405b7e41feSStefano Zampini /*@C 44412ef1f0ffSBarry Smith MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices. 
44425b7e41feSStefano Zampini 44432ef1f0ffSBarry Smith Not Collective 44445b7e41feSStefano Zampini 44455b7e41feSStefano Zampini Input Parameters: 44465b7e41feSStefano Zampini + A - the matrix 444711a5261eSBarry Smith - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 44485b7e41feSStefano Zampini 44495b7e41feSStefano Zampini Output Parameters: 445020f4b53cSBarry Smith + i - the CSR row pointers 445120f4b53cSBarry Smith - j - the CSR column indices 44525b7e41feSStefano Zampini 44535b7e41feSStefano Zampini Level: developer 44545b7e41feSStefano Zampini 445511a5261eSBarry Smith Note: 44565b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 44575b7e41feSStefano Zampini 44581cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 44595b7e41feSStefano Zampini @*/ 4460d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4461d71ae5a4SJacob Faibussowitsch { 44625f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 44635f101d05SStefano Zampini CsrMatrix *csr; 44645f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 44655f101d05SStefano Zampini 44665f101d05SStefano Zampini PetscFunctionBegin; 44675f101d05SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 44683ba16761SJacob Faibussowitsch if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); 44695f101d05SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4470aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 44719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 447228b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 44735f101d05SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 44745f101d05SStefano Zampini if (i) { 44755f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 44765f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 44775f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 44785f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 44799566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 44805f101d05SStefano Zampini } 44815f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 44825f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 44835f101d05SStefano Zampini } 44845f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 44853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 44865f101d05SStefano Zampini } 44875f101d05SStefano Zampini 44885b7e41feSStefano Zampini /*@C 44892ef1f0ffSBarry Smith MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()` 44905b7e41feSStefano Zampini 44912ef1f0ffSBarry Smith Not Collective 44925b7e41feSStefano Zampini 44935b7e41feSStefano Zampini Input Parameters: 44945b7e41feSStefano Zampini + A - the matrix 44952ef1f0ffSBarry Smith . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 449620f4b53cSBarry Smith . 
i - the CSR row pointers 449720f4b53cSBarry Smith - j - the CSR column indices 44985b7e41feSStefano Zampini 44995b7e41feSStefano Zampini Level: developer 45005b7e41feSStefano Zampini 45011cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()` 45025b7e41feSStefano Zampini @*/ 450320f4b53cSBarry Smith PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4504d71ae5a4SJacob Faibussowitsch { 45055f101d05SStefano Zampini PetscFunctionBegin; 45065f101d05SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 45075f101d05SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 45085f101d05SStefano Zampini if (i) *i = NULL; 45095f101d05SStefano Zampini if (j) *j = NULL; 451020f4b53cSBarry Smith (void)compressed; 45113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 45125f101d05SStefano Zampini } 45135f101d05SStefano Zampini 45145b7e41feSStefano Zampini /*@C 451511a5261eSBarry Smith MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 45165b7e41feSStefano Zampini 45175b7e41feSStefano Zampini Not Collective 45185b7e41feSStefano Zampini 45195b7e41feSStefano Zampini Input Parameter: 452011a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 45215b7e41feSStefano Zampini 45225b7e41feSStefano Zampini Output Parameter: 45235b7e41feSStefano Zampini . a - pointer to the device data 45245b7e41feSStefano Zampini 45255b7e41feSStefano Zampini Level: developer 45265b7e41feSStefano Zampini 452711a5261eSBarry Smith Note: 452811a5261eSBarry Smith May trigger host-device copies if up-to-date matrix data is on host 45295b7e41feSStefano Zampini 45301cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 45315b7e41feSStefano Zampini @*/ 4532d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) 4533d71ae5a4SJacob Faibussowitsch { 4534ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4535ed502f03SStefano Zampini CsrMatrix *csr; 4536ed502f03SStefano Zampini 4537ed502f03SStefano Zampini PetscFunctionBegin; 4538ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 45394f572ea9SToby Isaac PetscAssertPointer(a, 2); 4540ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4541aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 45429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 454328b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4544ed502f03SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 454528b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4546ed502f03SStefano Zampini *a = csr->values->data().get(); 45473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4548ed502f03SStefano Zampini } 4549ed502f03SStefano Zampini 45505b7e41feSStefano Zampini /*@C 455111a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()` 45525b7e41feSStefano Zampini 45535b7e41feSStefano Zampini Not Collective 45545b7e41feSStefano Zampini 45552ef1f0ffSBarry Smith Input Parameters: 45562ef1f0ffSBarry Smith + A - 
a `MATSEQAIJCUSPARSE` matrix 45572ef1f0ffSBarry Smith - a - pointer to the device data 45585b7e41feSStefano Zampini 45595b7e41feSStefano Zampini Level: developer 45605b7e41feSStefano Zampini 45611cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()` 45625b7e41feSStefano Zampini @*/ 4563d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) 4564d71ae5a4SJacob Faibussowitsch { 4565ed502f03SStefano Zampini PetscFunctionBegin; 4566ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 45674f572ea9SToby Isaac PetscAssertPointer(a, 2); 4568ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4569ed502f03SStefano Zampini *a = NULL; 45703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4571ed502f03SStefano Zampini } 4572ed502f03SStefano Zampini 45735b7e41feSStefano Zampini /*@C 457411a5261eSBarry Smith MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 45755b7e41feSStefano Zampini 45765b7e41feSStefano Zampini Not Collective 45775b7e41feSStefano Zampini 45785b7e41feSStefano Zampini Input Parameter: 457911a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 45805b7e41feSStefano Zampini 45815b7e41feSStefano Zampini Output Parameter: 45825b7e41feSStefano Zampini . a - pointer to the device data 45835b7e41feSStefano Zampini 45845b7e41feSStefano Zampini Level: developer 45855b7e41feSStefano Zampini 458611a5261eSBarry Smith Note: 458711a5261eSBarry Smith May trigger host-device copies if up-to-date matrix data is on host 45885b7e41feSStefano Zampini 45891cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 45905b7e41feSStefano Zampini @*/ 4591d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) 4592d71ae5a4SJacob Faibussowitsch { 4593039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4594039c6fbaSStefano Zampini CsrMatrix *csr; 4595039c6fbaSStefano Zampini 4596039c6fbaSStefano Zampini PetscFunctionBegin; 4597039c6fbaSStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 45984f572ea9SToby Isaac PetscAssertPointer(a, 2); 4599039c6fbaSStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4600aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 46019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 460228b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4603039c6fbaSStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 460428b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4605039c6fbaSStefano Zampini *a = csr->values->data().get(); 4606039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 46079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 46083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4609039c6fbaSStefano Zampini } 46105b7e41feSStefano Zampini /*@C 461111a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()` 4612039c6fbaSStefano Zampini 46135b7e41feSStefano Zampini Not Collective 46145b7e41feSStefano Zampini 
46152ef1f0ffSBarry Smith Input Parameters: 46162ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix 46172ef1f0ffSBarry Smith - a - pointer to the device data 46185b7e41feSStefano Zampini 46195b7e41feSStefano Zampini Level: developer 46205b7e41feSStefano Zampini 46211cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()` 46225b7e41feSStefano Zampini @*/ 4623d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) 4624d71ae5a4SJacob Faibussowitsch { 4625039c6fbaSStefano Zampini PetscFunctionBegin; 4626039c6fbaSStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 46274f572ea9SToby Isaac PetscAssertPointer(a, 2); 4628039c6fbaSStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 46299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 46309566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4631039c6fbaSStefano Zampini *a = NULL; 46323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4633039c6fbaSStefano Zampini } 4634039c6fbaSStefano Zampini 46355b7e41feSStefano Zampini /*@C 463611a5261eSBarry Smith MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 46375b7e41feSStefano Zampini 46385b7e41feSStefano Zampini Not Collective 46395b7e41feSStefano Zampini 46405b7e41feSStefano Zampini Input Parameter: 464111a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 46425b7e41feSStefano Zampini 46435b7e41feSStefano Zampini Output Parameter: 46445b7e41feSStefano Zampini . a - pointer to the device data 46455b7e41feSStefano Zampini 46465b7e41feSStefano Zampini Level: developer 46475b7e41feSStefano Zampini 464811a5261eSBarry Smith Note: 464911a5261eSBarry Smith Does not trigger host-device copies and flags data validity on the GPU 46505b7e41feSStefano Zampini 46511cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 46525b7e41feSStefano Zampini @*/ 4653d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) 4654d71ae5a4SJacob Faibussowitsch { 4655ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4656ed502f03SStefano Zampini CsrMatrix *csr; 4657ed502f03SStefano Zampini 4658ed502f03SStefano Zampini PetscFunctionBegin; 4659ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 46604f572ea9SToby Isaac PetscAssertPointer(a, 2); 4661ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4662aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 466328b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4664ed502f03SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 466528b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4666ed502f03SStefano Zampini *a = csr->values->data().get(); 4667039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 46689566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 46693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4670ed502f03SStefano Zampini } 4671ed502f03SStefano Zampini 46725b7e41feSStefano Zampini /*@C 467311a5261eSBarry Smith 
MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()` 46745b7e41feSStefano Zampini 46755b7e41feSStefano Zampini Not Collective 46765b7e41feSStefano Zampini 46772ef1f0ffSBarry Smith Input Parameters: 46782ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix 46792ef1f0ffSBarry Smith - a - pointer to the device data 46805b7e41feSStefano Zampini 46815b7e41feSStefano Zampini Level: developer 46825b7e41feSStefano Zampini 46831cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()` 46845b7e41feSStefano Zampini @*/ 4685d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) 4686d71ae5a4SJacob Faibussowitsch { 4687ed502f03SStefano Zampini PetscFunctionBegin; 4688ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 46894f572ea9SToby Isaac PetscAssertPointer(a, 2); 4690ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 46919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 46929566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4693ed502f03SStefano Zampini *a = NULL; 46943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4695ed502f03SStefano Zampini } 4696ed502f03SStefano Zampini 46979371c9d4SSatish Balay struct IJCompare4 { 4698d71ae5a4SJacob Faibussowitsch __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4699d71ae5a4SJacob Faibussowitsch { 47000b156cc8SJunchao Zhang if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; 47010b156cc8SJunchao Zhang if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 4702ed502f03SStefano Zampini return false; 4703ed502f03SStefano Zampini } 4704ed502f03SStefano Zampini }; 4705ed502f03SStefano Zampini 47069371c9d4SSatish Balay struct Shift { 4707ed502f03SStefano Zampini int _shift; 4708ed502f03SStefano Zampini 4709ed502f03SStefano Zampini Shift(int shift) : _shift(shift) { } 47109371c9d4SSatish Balay __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4711ed502f03SStefano Zampini }; 4712ed502f03SStefano Zampini 471321afe8ebSBarry Smith /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. 
[A';B']' operation in MATLAB notation */ 4714d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4715d71ae5a4SJacob Faibussowitsch { 4716ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4717ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4718ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4719ed502f03SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 4720ed502f03SStefano Zampini PetscInt Annz, Bnnz; 4721ed502f03SStefano Zampini cusparseStatus_t stat; 4722ed502f03SStefano Zampini PetscInt i, m, n, zero = 0; 4723ed502f03SStefano Zampini 4724ed502f03SStefano Zampini PetscFunctionBegin; 4725ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4726ed502f03SStefano Zampini PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 47274f572ea9SToby Isaac PetscAssertPointer(C, 4); 4728ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4729ed502f03SStefano Zampini PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 47305f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 473108401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4732aed4548fSBarry Smith PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4733aed4548fSBarry Smith PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4734ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4735ed502f03SStefano Zampini m = A->rmap->n; 4736ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 47379566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, C)); 47389566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C, m, n, m, n)); 47399566063dSJacob Faibussowitsch PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4740ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4741ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4742ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4743ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4744ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4745ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4746ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4747ed502f03SStefano Zampini c->compressedrow.i = NULL; 4748ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4749ed502f03SStefano Zampini Ccusp->workVector = NULL; 4750ed502f03SStefano Zampini Ccusp->nrows = m; 4751ed502f03SStefano Zampini Ccusp->mat = Cmat; 4752ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4753ed502f03SStefano Zampini Ccsr->num_rows = m; 4754ed502f03SStefano Zampini Ccsr->num_cols = n; 47559566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 47569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 47579566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4758f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 4759f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, 
sizeof(PetscScalar))); 4760f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 47619566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47629566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47639566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 47659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 476628b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 476728b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4768ed502f03SStefano Zampini 4769ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4770ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4771ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4772ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4773ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4774ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4775ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4776ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4777ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 47782c4ab24aSJunchao Zhang Ccusp->coords = new THRUSTINTARRAY(c->nz); 4779ed502f03SStefano Zampini if (c->nz) { 47802ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 47812ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 47822ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 47832ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff, *Broff; 47842ed87e7eSStefano Zampini 4785ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4786ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4787ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4788ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 47899566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4790ed502f03SStefano Zampini } 47912ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 47922ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4793ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4794ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4795ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4796ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 47979566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4798ed502f03SStefano Zampini } 47992ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 48002ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 48019566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 48029371c9d4SSatish Balay stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 48039371c9d4SSatish Balay PetscCallCUSPARSE(stat); 48049371c9d4SSatish Balay stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, 
Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 48059371c9d4SSatish Balay PetscCallCUSPARSE(stat); 48062ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 48072ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 48082ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 48098909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4810ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4811ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 48128909a122SStefano Zampini #else 48138909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 48148909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 48158909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 48168909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 48178909a122SStefano Zampini #endif 48182ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 48192ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 48202ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 48212ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 48222ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 48232ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 48242c4ab24aSJunchao Zhang auto p1 = Ccusp->coords->begin(); 48252c4ab24aSJunchao Zhang auto p2 = Ccusp->coords->begin(); 4826ed502f03SStefano Zampini thrust::advance(p2, Annz); 4827792fecdfSBarry Smith PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 48288909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 48298909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 48308909a122SStefano Zampini #endif 48312ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 48322ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 48332ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 4834792fecdfSBarry Smith PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 48352ed87e7eSStefano Zampini #else 48362ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 4837792fecdfSBarry Smith PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4838792fecdfSBarry Smith PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 48392ed87e7eSStefano Zampini #endif 48409371c9d4SSatish Balay stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 48419371c9d4SSatish Balay PetscCallCUSPARSE(stat); 48429566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 48432ed87e7eSStefano Zampini delete wPerm; 48442ed87e7eSStefano Zampini delete Acoo; 48452ed87e7eSStefano Zampini delete Bcoo; 
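      /* Ccusp->coords now holds the merge permutation: its first Annz entries are the positions in C's value
         array taken by entries of A, and the remaining Bnnz entries are the positions taken by entries of B.
         The MAT_REUSE_MATRIX branch below relies on this permutation to refresh C's values without
         recomputing the merged sparsity pattern. */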
48462ed87e7eSStefano Zampini delete Ccoo; 4847ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 48489371c9d4SSatish Balay stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 48499371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4850ed502f03SStefano Zampini #endif 48511a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 48529566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 48539566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4854ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4855ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4856ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4857ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4858ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4859ed502f03SStefano Zampini 48601a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 48611a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4862a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4863ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4864ed502f03SStefano Zampini CmatT->mat = CcsrT; 4865ed502f03SStefano Zampini CcsrT->num_rows = n; 4866ed502f03SStefano Zampini CcsrT->num_cols = m; 4867ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4868ed502f03SStefano Zampini 4869ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4870ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4871ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4872ed502f03SStefano Zampini 48739566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4874ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4875ed502f03SStefano Zampini if (AT) { 4876ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4877ed502f03SStefano Zampini thrust::advance(rT, -1); 4878ed502f03SStefano Zampini } 4879ed502f03SStefano Zampini if (BT) { 4880ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4881ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4882ed502f03SStefano Zampini thrust::copy(titb, tite, rT); 4883ed502f03SStefano Zampini } 4884ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4885ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4886ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4887ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4888ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4889ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 48909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4891ed502f03SStefano Zampini 48929566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 48939566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 48949566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4895f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar))); 4896f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar))); 4897f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar))); 48989566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 48999566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 49009566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4901ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 49029371c9d4SSatish Balay stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 49039371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4904ed502f03SStefano Zampini #endif 4905ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4906ed502f03SStefano Zampini } 4907ed502f03SStefano Zampini } 4908ed502f03SStefano Zampini 4909ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 49109f0612e4SBarry Smith PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 49119f0612e4SBarry Smith PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 4912ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 49137de69702SBarry Smith if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 4914ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4915ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4916ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4917ed502f03SStefano Zampini jj = *Ccsr->column_indices; 49189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 49199566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4920ed502f03SStefano Zampini } else { 49219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 49229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4923ed502f03SStefano Zampini } 49249566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 49259566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 49269566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->imax)); 4927ed502f03SStefano Zampini c->maxnz = c->nz; 4928ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4929ed502f03SStefano Zampini c->rmax = 0; 4930ed502f03SStefano Zampini for (i 
= 0; i < m; i++) { 4931ed502f03SStefano Zampini const PetscInt nn = c->i[i + 1] - c->i[i]; 4932ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4933ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4934ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 4935ed502f03SStefano Zampini } 49369566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 49379566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->a)); 4938ed502f03SStefano Zampini (*C)->nonzerostate++; 49399566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 49409566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 4941ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4942ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4943ed502f03SStefano Zampini } else { 494408401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4945ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4946ed502f03SStefano Zampini if (c->nz) { 4947ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 49482c4ab24aSJunchao Zhang PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords"); 4949aed4548fSBarry Smith PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 495008401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 49519566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 49529566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 49535f80ce2aSJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 49545f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4955ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4956ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4957ed502f03SStefano Zampini Ccsr = (CsrMatrix *)Ccusp->mat->mat; 4958aed4548fSBarry Smith PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4959aed4548fSBarry Smith PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4960aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4961aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 49622c4ab24aSJunchao Zhang PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size()); 49632c4ab24aSJunchao Zhang auto pmid = Ccusp->coords->begin(); 4964ed502f03SStefano Zampini thrust::advance(pmid, Acsr->num_entries); 49659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 
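      /* Scatter the up-to-date values of A and B into C through the coords permutation saved at creation time:
         the permutation iterators below write A's j-th stored value into C's value slot coords[j] and B's j-th
         stored value into slot coords[Acsr->num_entries + j]; only the values move, the pattern is untouched. */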
49662c4ab24aSJunchao Zhang auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin()))); 49679371c9d4SSatish Balay auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4968ed502f03SStefano Zampini thrust::for_each(zibait, zieait, VecCUDAEquals()); 49699371c9d4SSatish Balay auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 49702c4ab24aSJunchao Zhang auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end()))); 4971ed502f03SStefano Zampini thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 49729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 49731a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 49745f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4975ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4976ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4977ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4978ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 4979ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4980ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4981ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 49821a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4983ed502f03SStefano Zampini } 49849566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4985ed502f03SStefano Zampini } 4986ed502f03SStefano Zampini } 49879566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4988ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4989ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4990ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 49913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4992ed502f03SStefano Zampini } 4993c215019aSStefano Zampini 4994d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4995d71ae5a4SJacob Faibussowitsch { 4996c215019aSStefano Zampini bool dmem; 4997c215019aSStefano Zampini const PetscScalar *av; 4998c215019aSStefano Zampini 4999c215019aSStefano Zampini PetscFunctionBegin; 5000c215019aSStefano Zampini dmem = isCudaMem(v); 50019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av)); 5002c215019aSStefano Zampini if (n && idx) { 5003c215019aSStefano Zampini THRUSTINTARRAY widx(n); 5004c215019aSStefano Zampini widx.assign(idx, idx + n); 50059566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 5006c215019aSStefano Zampini 5007c215019aSStefano Zampini THRUSTARRAY *w = NULL; 5008c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 5009c215019aSStefano Zampini if (dmem) { 5010c215019aSStefano Zampini dv = 
thrust::device_pointer_cast(v); 5011c215019aSStefano Zampini } else { 5012c215019aSStefano Zampini w = new THRUSTARRAY(n); 5013c215019aSStefano Zampini dv = w->data(); 5014c215019aSStefano Zampini } 5015c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 5016c215019aSStefano Zampini 5017c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv)); 5018c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n)); 5019c215019aSStefano Zampini thrust::for_each(zibit, zieit, VecCUDAEquals()); 502048a46eb9SPierre Jolivet if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 5021c215019aSStefano Zampini delete w; 5022c215019aSStefano Zampini } else { 50239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 5024c215019aSStefano Zampini } 50259566063dSJacob Faibussowitsch if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 50269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av)); 50273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5028c215019aSStefano Zampini } 5029b0c00012SPierre Jolivet PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 5030
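/*
   Illustrative sketch (not part of PETSc): one way a caller could use the device-array accessors defined above
   to scale every stored value of a MATSEQAIJCUSPARSE matrix directly on the GPU. The helper name
   MatSeqAIJCUSPARSEScaleValues_Sketch is hypothetical; only MatSeqAIJCUSPARSEGetArray(),
   MatSeqAIJCUSPARSERestoreArray(), and the Mat_SeqAIJ nonzero count used here come from this file. It assumes
   thrust/device_ptr.h, thrust/transform.h, thrust/functional.h, and thrust/iterator/constant_iterator.h are
   available, as they are in this compilation unit.

     static PetscErrorCode MatSeqAIJCUSPARSEScaleValues_Sketch(Mat A, PetscScalar alpha)
     {
       Mat_SeqAIJ  *aij = (Mat_SeqAIJ *)A->data; // host-side metadata; aij->nz is the number of stored entries
       PetscScalar *a;

       PetscFunctionBegin;
       PetscCall(MatSeqAIJCUSPARSEGetArray(A, &a)); // device pointer; marks the GPU copy as the valid one
       {
         auto da = thrust::device_pointer_cast(a);
         // multiply each stored value by alpha entirely on the device
         thrust::transform(da, da + aij->nz, thrust::make_constant_iterator(alpha), da, thrust::multiplies<PetscScalar>());
       }
       PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &a)); // invalidates the cached diagonal and bumps the object state
       PetscFunctionReturn(PETSC_SUCCESS);
     }
*/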