19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 69ae82921SPaul Mullowney 73d13b8fdSMatthew G. Knepley #include <petscconf.h> 83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 11af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 129ae82921SPaul Mullowney #undef VecType 133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14 16d0967f54SJacob Faibussowitsch #define PETSC_HAVE_THRUST_ASYNC 1 17d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14 18a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 19d0967f54SJacob Faibussowitsch #endif 20a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 21a2cee5feSJed Brown #include <thrust/remove.h> 22a2cee5feSJed Brown #include <thrust/sort.h> 23a2cee5feSJed Brown #include <thrust/unique.h> 24e8d2b73aSMark Adams 25b0c00012SPierre Jolivet PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 26e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 27afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 28afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 29afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
30afb2bd1cSJunchao Zhang 31afb2bd1cSJunchao Zhang typedef enum { 32afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 35afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 36afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 37afb2bd1cSJunchao Zhang 38afb2bd1cSJunchao Zhang typedef enum { 39afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 40afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 41afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 42afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 43afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 47afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 48afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 49afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 50afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 51afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 52afb2bd1cSJunchao Zhang 53afb2bd1cSJunchao Zhang typedef enum { 5435cb6cd3SPierre Jolivet CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic 5535cb6cd3SPierre Jolivet CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic 56afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 57afb2bd1cSJunchao Zhang */ 58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 59afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", 
"CUSPARSE_SPMM_", 0}; 60afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 61afb2bd1cSJunchao Zhang #endif 629ae82921SPaul Mullowney 63087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 65087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 666fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 67b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 70d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 716fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 72d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 73d460d7bfSJunchao Zhang #endif 74ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject); 75a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 7633c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 776fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 786fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 796fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 806fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 81e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 82e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 83e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 849ae82921SPaul Mullowney 857f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **); 882c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat); 897f756511SDominic Meiser 9057181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 91a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool); 9257181aedSStefano Zampini 93c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]); 94e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]); 95219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode); 96c215019aSStefano Zampini 97d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 98d71ae5a4SJacob Faibussowitsch { 99aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1006e111a19SKarl Rupp 101ca45077fSPaul Mullowney PetscFunctionBegin; 102ca45077fSPaul Mullowney switch (op) { 103d71ae5a4SJacob Faibussowitsch case MAT_CUSPARSE_MULT: 104d71ae5a4SJacob Faibussowitsch 
cusparsestruct->format = format; 105d71ae5a4SJacob Faibussowitsch break; 106d71ae5a4SJacob Faibussowitsch case MAT_CUSPARSE_ALL: 107d71ae5a4SJacob Faibussowitsch cusparsestruct->format = format; 108d71ae5a4SJacob Faibussowitsch break; 109d71ae5a4SJacob Faibussowitsch default: 110d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op); 111ca45077fSPaul Mullowney } 1123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 113ca45077fSPaul Mullowney } 1149ae82921SPaul Mullowney 115e057df02SPaul Mullowney /*@ 11611a5261eSBarry Smith MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular 11711a5261eSBarry Smith operation. Only the `MatMult()` operation can use different GPU storage formats 11811a5261eSBarry Smith 119e057df02SPaul Mullowney Not Collective 120e057df02SPaul Mullowney 121e057df02SPaul Mullowney Input Parameters: 12211a5261eSBarry Smith + A - Matrix of type `MATSEQAIJCUSPARSE` 1232ef1f0ffSBarry Smith . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. 1242ef1f0ffSBarry Smith `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`. 12511a5261eSBarry Smith - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.) 
126e057df02SPaul Mullowney 127e057df02SPaul Mullowney Level: intermediate 128e057df02SPaul Mullowney 129fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 130e057df02SPaul Mullowney @*/ 131d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 132d71ae5a4SJacob Faibussowitsch { 133e057df02SPaul Mullowney PetscFunctionBegin; 134e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 135cac4c232SBarry Smith PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format)); 1363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 137e057df02SPaul Mullowney } 138e057df02SPaul Mullowney 139d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) 140d71ae5a4SJacob Faibussowitsch { 141365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 142365b711fSMark Adams 143365b711fSMark Adams PetscFunctionBegin; 144365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 1453ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 146365b711fSMark Adams } 147365b711fSMark Adams 148365b711fSMark Adams /*@ 14911a5261eSBarry Smith MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`. 150365b711fSMark Adams 151365b711fSMark Adams Input Parameters: 15211a5261eSBarry Smith + A - Matrix of type `MATSEQAIJCUSPARSE` 15311a5261eSBarry Smith - use_cpu - set flag for using the built-in CPU `MatSolve()` 154365b711fSMark Adams 1552ef1f0ffSBarry Smith Level: intermediate 156365b711fSMark Adams 15711a5261eSBarry Smith Note: 158365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 159365b711fSMark Adams and moves the factors to the GPU for the solve. 
We have observed better performance keeping the data on the CPU and computing the solve there. 160365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 161365b711fSMark Adams 1621cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 163365b711fSMark Adams @*/ 164d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) 165d71ae5a4SJacob Faibussowitsch { 166365b711fSMark Adams PetscFunctionBegin; 167365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 168cac4c232SBarry Smith PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu)); 1693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 170365b711fSMark Adams } 171365b711fSMark Adams 17266976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) 173d71ae5a4SJacob Faibussowitsch { 174e6e9a74fSStefano Zampini PetscFunctionBegin; 1751a2c6b5cSJunchao Zhang switch (op) { 1761a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 1771a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 1789566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1791a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 1801a2c6b5cSJunchao Zhang break; 181d71ae5a4SJacob Faibussowitsch default: 182d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 183d71ae5a4SJacob Faibussowitsch break; 184e6e9a74fSStefano Zampini } 1853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 186e6e9a74fSStefano Zampini } 187e6e9a74fSStefano Zampini 188ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject) 189d71ae5a4SJacob 
Faibussowitsch { 190e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 1919ae82921SPaul Mullowney PetscBool flg; 192a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1936e111a19SKarl Rupp 1949ae82921SPaul Mullowney PetscFunctionBegin; 195d0609cedSBarry Smith PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options"); 1969ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 1979371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 1989566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format)); 199afb2bd1cSJunchao Zhang 2009371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 2019566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 2029566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg)); 2039566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve)); 204afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2059371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg)); 206afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with 
cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 207b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 208aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 209a435da06SStefano Zampini #else 210aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 211a435da06SStefano Zampini #endif 2129371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg)); 213aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 214afb2bd1cSJunchao Zhang 2159371c9d4SSatish Balay PetscCall( 2169371c9d4SSatish Balay PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg)); 217aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 218afb2bd1cSJunchao Zhang #endif 2194c87dfd4SPaul Mullowney } 220d0609cedSBarry Smith PetscOptionsHeadEnd(); 2213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2229ae82921SPaul Mullowney } 2239ae82921SPaul Mullowney 224b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 
225d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A) 226d460d7bfSJunchao Zhang { 227d460d7bfSJunchao Zhang Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data); 228d460d7bfSJunchao Zhang PetscInt m = A->rmap->n; 229d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 230d460d7bfSJunchao Zhang const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag; 231d460d7bfSJunchao Zhang const MatScalar *Aa = a->a; 232d460d7bfSJunchao Zhang PetscInt *Mi, *Mj, Mnz; 233d460d7bfSJunchao Zhang PetscScalar *Ma; 234d460d7bfSJunchao Zhang 235d460d7bfSJunchao Zhang PetscFunctionBegin; 236d460d7bfSJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU 237d460d7bfSJunchao Zhang if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0 238d460d7bfSJunchao Zhang // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host 239d460d7bfSJunchao Zhang Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal) 240d460d7bfSJunchao Zhang PetscCall(PetscMalloc1(m + 1, &Mi)); 241d460d7bfSJunchao Zhang PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp 242d460d7bfSJunchao Zhang PetscCall(PetscMalloc1(Mnz, &Ma)); 243d460d7bfSJunchao Zhang Mi[0] = 0; 244d460d7bfSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 245d460d7bfSJunchao Zhang PetscInt llen = Ai[i + 1] - Ai[i]; 246d460d7bfSJunchao Zhang PetscInt ulen = Adiag[i] - Adiag[i + 1]; 247d460d7bfSJunchao Zhang PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L 248d460d7bfSJunchao Zhang Mj[Mi[i] + llen] = i; // diagonal entry 249d460d7bfSJunchao Zhang PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 250d460d7bfSJunchao Zhang Mi[i + 1] = Mi[i] + llen + ulen; 251d460d7bfSJunchao 
Zhang } 252d460d7bfSJunchao Zhang // Copy M (L,U) from host to device 253f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1))); 254f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz)); 255f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz)); 256f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice)); 257f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice)); 258d460d7bfSJunchao Zhang 259d460d7bfSJunchao Zhang // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 260d460d7bfSJunchao Zhang // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 261d460d7bfSJunchao Zhang // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 262d460d7bfSJunchao Zhang // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 263d460d7bfSJunchao Zhang // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 264d460d7bfSJunchao Zhang cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER; 265d460d7bfSJunchao Zhang cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; 266d460d7bfSJunchao Zhang const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? 
CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 267d460d7bfSJunchao Zhang 268d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 269d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 270d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 271d460d7bfSJunchao Zhang 272d460d7bfSJunchao Zhang fillMode = CUSPARSE_FILL_MODE_UPPER; 273d460d7bfSJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 274d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 275d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 276d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 277d460d7bfSJunchao Zhang 278d460d7bfSJunchao Zhang // Allocate work vectors in SpSv 279f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m)); 280f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m)); 281d460d7bfSJunchao Zhang 282d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 283d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 284d460d7bfSJunchao Zhang 285d460d7bfSJunchao Zhang // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE 286d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 
287d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 288d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 289d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 290d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 291d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 292d460d7bfSJunchao Zhang 293d460d7bfSJunchao Zhang // Record for reuse 294d460d7bfSJunchao Zhang fs->csrRowPtr_h = Mi; 295d460d7bfSJunchao Zhang fs->csrVal_h = Ma; 296d460d7bfSJunchao Zhang PetscCall(PetscFree(Mj)); 297d460d7bfSJunchao Zhang } 298d460d7bfSJunchao Zhang // Copy the value 299d460d7bfSJunchao Zhang Mi = fs->csrRowPtr_h; 300d460d7bfSJunchao Zhang Ma = fs->csrVal_h; 301d460d7bfSJunchao Zhang Mnz = Mi[m]; 302d460d7bfSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 303d460d7bfSJunchao Zhang PetscInt llen = Ai[i + 1] - Ai[i]; 304d460d7bfSJunchao Zhang PetscInt ulen = Adiag[i] - Adiag[i + 1]; 305d460d7bfSJunchao Zhang PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L 306d460d7bfSJunchao Zhang Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]]; // recover the diagonal entry 307d460d7bfSJunchao Zhang PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 308d460d7bfSJunchao Zhang } 309d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice)); 310d460d7bfSJunchao Zhang 
311d460d7bfSJunchao Zhang // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 312d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 313d460d7bfSJunchao Zhang 314d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 315d460d7bfSJunchao Zhang 316d460d7bfSJunchao Zhang // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve 317d460d7bfSJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 318d460d7bfSJunchao Zhang } 319d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 320d460d7bfSJunchao Zhang } 321d460d7bfSJunchao Zhang #else 322d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 323d71ae5a4SJacob Faibussowitsch { 3249ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3259ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3269ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 327aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 3289ae82921SPaul Mullowney const PetscInt *ai = a->i, *aj = a->j, *vi; 3299ae82921SPaul Mullowney const MatScalar *aa = a->a, *v; 3309ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3319ae82921SPaul Mullowney PetscInt i, nz, nzLower, offset, rowOffset; 3329ae82921SPaul Mullowney 3339ae82921SPaul Mullowney PetscFunctionBegin; 3343ba16761SJacob Faibussowitsch if (!n) 
PetscFunctionReturn(PETSC_SUCCESS); 335c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3369ae82921SPaul Mullowney try { 3379ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3389ae82921SPaul Mullowney nzLower = n + ai[n] - ai[1]; 339da79fbbcSStefano Zampini if (!loTriFactor) { 3402cbc15d9SMark PetscScalar *AALo; 3412cbc15d9SMark 3429566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar))); 3439ae82921SPaul Mullowney 3449ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 3459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt))); 3469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt))); 3479ae82921SPaul Mullowney 3489ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3499ae82921SPaul Mullowney AiLo[0] = (PetscInt)0; 3509ae82921SPaul Mullowney AiLo[n] = nzLower; 3519ae82921SPaul Mullowney AjLo[0] = (PetscInt)0; 3529ae82921SPaul Mullowney AALo[0] = (MatScalar)1.0; 3539ae82921SPaul Mullowney v = aa; 3549ae82921SPaul Mullowney vi = aj; 3559ae82921SPaul Mullowney offset = 1; 3569ae82921SPaul Mullowney rowOffset = 1; 3579ae82921SPaul Mullowney for (i = 1; i < n; i++) { 3589ae82921SPaul Mullowney nz = ai[i + 1] - ai[i]; 359e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3609ae82921SPaul Mullowney AiLo[i] = rowOffset; 3619ae82921SPaul Mullowney rowOffset += nz + 1; 3629ae82921SPaul Mullowney 363f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AjLo[offset], vi, nz)); 364f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AALo[offset], v, nz)); 3659ae82921SPaul Mullowney 3669ae82921SPaul Mullowney offset += nz; 3679ae82921SPaul Mullowney AjLo[offset] = (PetscInt)i; 3689ae82921SPaul Mullowney AALo[offset] = (MatScalar)1.0; 
3699ae82921SPaul Mullowney offset += 1; 3709ae82921SPaul Mullowney 3719ae82921SPaul Mullowney v += nz; 3729ae82921SPaul Mullowney vi += nz; 3739ae82921SPaul Mullowney } 3742205254eSKarl Rupp 375aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 3769566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 377da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 378aa372e3fSPaul Mullowney /* Create the matrix description */ 3799566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 3809566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 3811b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 3829566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 383afb2bd1cSJunchao Zhang #else 3849566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 385afb2bd1cSJunchao Zhang #endif 3869566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 3879566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 388aa372e3fSPaul Mullowney 389aa372e3fSPaul Mullowney /* set the operation */ 390aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 391aa372e3fSPaul Mullowney 392aa372e3fSPaul Mullowney /* set the matrix */ 393aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 394aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 395aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 396aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 397aa372e3fSPaul Mullowney 398aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 399aa372e3fSPaul Mullowney 
loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1); 400aa372e3fSPaul Mullowney 401aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 402aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower); 403aa372e3fSPaul Mullowney 404aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 405aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo + nzLower); 406aa372e3fSPaul Mullowney 407afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 4089566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 409261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 4101b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4119371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 4129371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 4139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 414afb2bd1cSJunchao Zhang #endif 415afb2bd1cSJunchao Zhang 416aa372e3fSPaul Mullowney /* perform the solve analysis */ 4179371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 4189f7ba44dSJacob Faibussowitsch loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 4199566063dSJacob Faibussowitsch 
PetscCallCUDA(WaitForCUDA()); 4209566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 421aa372e3fSPaul Mullowney 422da79fbbcSStefano Zampini /* assign the pointer */ 423aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 4242cbc15d9SMark loTriFactor->AA_h = AALo; 4259566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiLo)); 4269566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjLo)); 4279566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar))); 428da79fbbcSStefano Zampini } else { /* update values only */ 42948a46eb9SPierre Jolivet if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar))); 430da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4312cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 432da79fbbcSStefano Zampini v = aa; 433da79fbbcSStefano Zampini vi = aj; 434da79fbbcSStefano Zampini offset = 1; 435da79fbbcSStefano Zampini for (i = 1; i < n; i++) { 436da79fbbcSStefano Zampini nz = ai[i + 1] - ai[i]; 437f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz)); 438da79fbbcSStefano Zampini offset += nz; 4392cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 440da79fbbcSStefano Zampini offset += 1; 441da79fbbcSStefano Zampini v += nz; 442da79fbbcSStefano Zampini } 4432cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower); 4449566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar))); 445da79fbbcSStefano Zampini } 446d71ae5a4SJacob Faibussowitsch } catch (char *ex) { 447d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 448d71ae5a4SJacob Faibussowitsch } 4499ae82921SPaul Mullowney } 4503ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4519ae82921SPaul Mullowney } 4529ae82921SPaul 
/*
  Build (first call) or refresh (later calls) the upper-triangular ILU factor of A on the GPU.

  On first build (upTriFactor == NULL) the factor stored in A->data is repacked into a plain CSR
  matrix in pinned host buffers (cudaMallocHost) and uploaded to device thrust arrays, a cuSPARSE
  matrix descriptor is created (upper fill mode, non-unit diagonal), and the csrsv solve-analysis
  phase is run. Each row is emitted with its diagonal entry first, stored as the reciprocal
  (AAUp[offset] = 1/v[nz]); rows are walked from i = n-1 down to 0 with a decreasing offset,
  since a->diag[] addresses the U part back-to-front (adiag[0] > adiag[n] — the nonzero count is
  adiag[0] - adiag[n]; layout per the symbolic factorization, see MatILUFactorSymbolic_SeqAIJ).
  On a values-only update (upTriFactor exists) only the numerical values are repacked into the
  cached host buffer AA_h and re-uploaded; pattern, descriptor, and analysis info are reused.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first build: create pattern, values, descriptor, and analysis info */
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          /* row i of U (past the diagonal) lives at aa/aj + adiag[i+1] + 1 in the factored storage */
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz]; /* diagonal is stored inverted in the factor */
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        /* NOTE(review): device index arrays are 32-bit (THRUSTINTARRAY32) populated from PetscInt
           host arrays — assumes row/column indices fit in int; confirm for 64-bit index builds */
        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp; /* keep the pinned value buffer for later value-only updates */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        /* NOTE(review): logs sizeof(int) for the index traffic while AiUp/AjUp are PetscInt —
           the logged byte count is off under 64-bit indices (affects logging only) */
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* values-only update: repack values into the cached host buffer and re-upload */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  Copy the host ILU(0) factors of A to the GPU and set up the triangular-solve data.
  With CUDA >= 11.4 the combined factored-matrix path (SpSV based) is used; otherwise the
  separate lower/upper legacy csrsv factor structures are built plus a work vector. Afterwards
  the (non-identity) row and inverse-column permutations are cached on the device for MatSolve.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, isicol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices: cache the row permutation on the device (only if non-identity) */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices: cache the inverse column permutation on the device */
  PetscCall(ISIdentity(isicol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build (or refresh) the device data needed to solve Ut D U x = b with cusparseSpSV:
// a CSR copy of U with unit diagonal plus a separate diagonal vector D.
// ("Cheolesky" is a long-standing typo in this static function's name; kept so in-file callers
// outside this view still resolve.)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                               // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; only the pattern is copied here, values follow below
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value: repack factor values (and D) into the cached host buffers, then upload
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // off-diagonal entries are negated on upload
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve Ut D U x = b
// Steps: optional row-permute b into X, solve Ut Y = X, scale Y by D (element-wise,
// D holds inverted diagonal entries), solve U X = Y, optional column-permute back into x.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: solve straight out of b's device array (const-cast is safe; SpSV reads X)
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); // two triangular solves (2*nz each) minus the diagonal scaling accounting
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Build the lower and upper triangular factor structures for an ICC factorization
   on the GPU (legacy csrsv path for CUDA < 11.4). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  /* NOTE(review): A->data is cast both as Mat_SeqAIJ (for nz) and Mat_SeqSBAIJ (for i/j/a) —
     presumably the ICC factor is stored in SBAIJ layout; confirm against the factorization code */
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS); /* empty matrix: nothing to build */
if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 791087f3262SPaul Mullowney try { 7929566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 7939566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 794da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 795087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 7969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 7979566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 798087f3262SPaul Mullowney 799087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 800087f3262SPaul Mullowney AiUp[0] = (PetscInt)0; 801087f3262SPaul Mullowney AiUp[n] = nzUpper; 802087f3262SPaul Mullowney offset = 0; 803087f3262SPaul Mullowney for (i = 0; i < n; i++) { 804087f3262SPaul Mullowney /* set the pointers */ 805087f3262SPaul Mullowney v = aa + ai[i]; 806087f3262SPaul Mullowney vj = aj + ai[i]; 807087f3262SPaul Mullowney nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 808087f3262SPaul Mullowney 809087f3262SPaul Mullowney /* first, set the diagonal elements */ 810087f3262SPaul Mullowney AjUp[offset] = (PetscInt)i; 81109f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0 / v[nz]; 812087f3262SPaul Mullowney AiUp[i] = offset; 81309f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0 / v[nz]; 814087f3262SPaul Mullowney 815087f3262SPaul Mullowney offset += 1; 816087f3262SPaul Mullowney if (nz > 0) { 817f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AjUp[offset], vj, nz)); 818f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 819087f3262SPaul Mullowney for (j = offset; j < offset + nz; j++) { 820087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 821087f3262SPaul Mullowney AALo[j] = AAUp[j] / v[nz]; 822087f3262SPaul Mullowney } 
823087f3262SPaul Mullowney offset += nz; 824087f3262SPaul Mullowney } 825087f3262SPaul Mullowney } 826087f3262SPaul Mullowney 827aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8289566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 829da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 830087f3262SPaul Mullowney 831aa372e3fSPaul Mullowney /* Create the matrix description */ 8329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 8339566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8341b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8359566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 836afb2bd1cSJunchao Zhang #else 8379566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 838afb2bd1cSJunchao Zhang #endif 8399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8409566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 841087f3262SPaul Mullowney 842aa372e3fSPaul Mullowney /* set the matrix */ 843aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 844aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 845aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 846aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 847aa372e3fSPaul Mullowney 848aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 849aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 850aa372e3fSPaul Mullowney 851aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 
852aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 853aa372e3fSPaul Mullowney 854aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 855aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 856aa372e3fSPaul Mullowney 857afb2bd1cSJunchao Zhang /* set the operation */ 858afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 859afb2bd1cSJunchao Zhang 860afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 8619566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 862261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 8631b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8649371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 8659371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 8669566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 867afb2bd1cSJunchao Zhang #endif 868afb2bd1cSJunchao Zhang 869aa372e3fSPaul Mullowney /* perform the solve analysis */ 8709371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 8719f7ba44dSJacob Faibussowitsch upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 8729f7ba44dSJacob Faibussowitsch 8739566063dSJacob Faibussowitsch 
PetscCallCUDA(WaitForCUDA()); 8749566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 875aa372e3fSPaul Mullowney 876da79fbbcSStefano Zampini /* assign the pointer */ 877aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 878aa372e3fSPaul Mullowney 879aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8809566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 881da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 882aa372e3fSPaul Mullowney 883aa372e3fSPaul Mullowney /* Create the matrix description */ 8849566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 8859566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8861b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8879566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 888afb2bd1cSJunchao Zhang #else 8899566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 890afb2bd1cSJunchao Zhang #endif 8919566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8929566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 893aa372e3fSPaul Mullowney 894aa372e3fSPaul Mullowney /* set the operation */ 895aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 896aa372e3fSPaul Mullowney 897aa372e3fSPaul Mullowney /* set the matrix */ 898aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 899aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 900aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 901aa372e3fSPaul Mullowney 
loTriFactor->csrMat->num_entries = a->nz; 902aa372e3fSPaul Mullowney 903aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 904aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 905aa372e3fSPaul Mullowney 906aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 907aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 908aa372e3fSPaul Mullowney 909aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 910aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 911aa372e3fSPaul Mullowney 912afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 9139566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 914261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 9151b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 9169371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 9179371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 9189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 919afb2bd1cSJunchao Zhang #endif 920afb2bd1cSJunchao Zhang 921aa372e3fSPaul Mullowney /* perform the solve analysis */ 9229371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 9239f7ba44dSJacob Faibussowitsch 
loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 9249f7ba44dSJacob Faibussowitsch 9259566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9269566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 927aa372e3fSPaul Mullowney 928da79fbbcSStefano Zampini /* assign the pointer */ 929aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 930087f3262SPaul Mullowney 9319566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 9329566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 9339566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 934da79fbbcSStefano Zampini } else { 935da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 936da79fbbcSStefano Zampini offset = 0; 937da79fbbcSStefano Zampini for (i = 0; i < n; i++) { 938da79fbbcSStefano Zampini /* set the pointers */ 939da79fbbcSStefano Zampini v = aa + ai[i]; 940da79fbbcSStefano Zampini nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 941da79fbbcSStefano Zampini 942da79fbbcSStefano Zampini /* first, set the diagonal elements */ 943da79fbbcSStefano Zampini AAUp[offset] = 1.0 / v[nz]; 944da79fbbcSStefano Zampini AALo[offset] = 1.0 / v[nz]; 945da79fbbcSStefano Zampini 946da79fbbcSStefano Zampini offset += 1; 947da79fbbcSStefano Zampini if (nz > 0) { 948f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 949da79fbbcSStefano Zampini for (j = offset; j < offset + nz; j++) { 950da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 951da79fbbcSStefano Zampini AALo[j] = AAUp[j] / v[nz]; 952da79fbbcSStefano Zampini } 953da79fbbcSStefano Zampini offset += nz; 954da79fbbcSStefano Zampini } 955da79fbbcSStefano Zampini } 95628b400f6SJacob Faibussowitsch PetscCheck(upTriFactor, 
PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 95728b400f6SJacob Faibussowitsch PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 958da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 959da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 9609566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 961da79fbbcSStefano Zampini } 9629566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AAUp)); 9639566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AALo)); 964d71ae5a4SJacob Faibussowitsch } catch (char *ex) { 965d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 966d71ae5a4SJacob Faibussowitsch } 967087f3262SPaul Mullowney } 9683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 969087f3262SPaul Mullowney } 970d460d7bfSJunchao Zhang #endif 971087f3262SPaul Mullowney 972d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 973d71ae5a4SJacob Faibussowitsch { 974087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 975087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 976087f3262SPaul Mullowney IS ip = a->row; 977087f3262SPaul Mullowney PetscBool perm_identity; 978087f3262SPaul Mullowney PetscInt n = A->rmap->n; 979087f3262SPaul Mullowney 980087f3262SPaul Mullowney PetscFunctionBegin; 98128b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 982d460d7bfSJunchao Zhang 983b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 984d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A)); 985d460d7bfSJunchao Zhang #else 9869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 987ad540459SPierre Jolivet if 
(!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 988d460d7bfSJunchao Zhang #endif 989aa372e3fSPaul Mullowney cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 990aa372e3fSPaul Mullowney 991da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 992da79fbbcSStefano Zampini 993087f3262SPaul Mullowney /* lower triangular indices */ 9949566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip, &perm_identity)); 995087f3262SPaul Mullowney if (!perm_identity) { 9964e4bbfaaSStefano Zampini IS iip; 997da79fbbcSStefano Zampini const PetscInt *irip, *rip; 9984e4bbfaaSStefano Zampini 9999566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 10009566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iip, &irip)); 10019566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ip, &rip)); 1002aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 1003aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip + n); 1004aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 10054e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip + n); 10069566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iip, &irip)); 10079566063dSJacob Faibussowitsch PetscCall(ISDestroy(&iip)); 10089566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ip, &rip)); 10099566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2. 
* n * sizeof(PetscInt))); 1010da79fbbcSStefano Zampini } 10113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1012087f3262SPaul Mullowney } 1013087f3262SPaul Mullowney 1014d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1015d71ae5a4SJacob Faibussowitsch { 1016087f3262SPaul Mullowney PetscFunctionBegin; 10179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 10189566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1019ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 1020d460d7bfSJunchao Zhang 1021b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1022d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1023d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1024d460d7bfSJunchao Zhang #else 1025087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 1026d460d7bfSJunchao Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1027d460d7bfSJunchao Zhang IS ip = b->row; 1028d460d7bfSJunchao Zhang PetscBool perm_identity; 1029d460d7bfSJunchao Zhang 10309566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip, &perm_identity)); 1031087f3262SPaul Mullowney if (perm_identity) { 1032087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1033087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1034087f3262SPaul Mullowney } else { 1035087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1036087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1037d460d7bfSJunchao Zhang } 1038d460d7bfSJunchao Zhang #endif 10394e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 10404e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 1041087f3262SPaul Mullowney 1042087f3262SPaul Mullowney /* get the triangular factors */ 10439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 10443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1045087f3262SPaul Mullowney } 10469ae82921SPaul Mullowney 1047b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1048d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1049d71ae5a4SJacob Faibussowitsch { 1050bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1051aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1052aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1053da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1054da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1055aa372e3fSPaul Mullowney 
cusparseIndexBase_t indexBase; 1056aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1057aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 1058aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 1059b175d8bbSPaul Mullowney 1060bda325fcSPaul Mullowney PetscFunctionBegin; 1061aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 10629566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 1063da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1064aa372e3fSPaul Mullowney 1065aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1066aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1067aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 10689371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1069aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1070aa372e3fSPaul Mullowney 1071aa372e3fSPaul Mullowney /* Create the matrix description */ 10729566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 10739566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 10749566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 10759566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 10769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1077aa372e3fSPaul Mullowney 1078aa372e3fSPaul Mullowney /* set the operation */ 1079aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1080aa372e3fSPaul Mullowney 1081aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 
1082aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1083afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1084afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1085aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1086afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1087afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1088afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1089aa372e3fSPaul Mullowney 1090aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1091afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 10929371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 10939371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 10949371c9d4SSatish Balay loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 10959566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1096afb2bd1cSJunchao Zhang #endif 1097afb2bd1cSJunchao Zhang 10989566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 10999f7ba44dSJacob Faibussowitsch { 11009f7ba44dSJacob Faibussowitsch // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
11019f7ba44dSJacob Faibussowitsch auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 11029371c9d4SSatish Balay loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1103afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11049f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1105afb2bd1cSJunchao Zhang #else 11069f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1107afb2bd1cSJunchao Zhang #endif 11089f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(stat); 11099f7ba44dSJacob Faibussowitsch } 11109f7ba44dSJacob Faibussowitsch 11119566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11129566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1113aa372e3fSPaul Mullowney 1114afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 11159566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1116261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 11171b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 11189371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 11199371c9d4SSatish Balay loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 
loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 11209566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1121afb2bd1cSJunchao Zhang #endif 1122afb2bd1cSJunchao Zhang 1123afb2bd1cSJunchao Zhang /* perform the solve analysis */ 11249371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 11259f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 11269f7ba44dSJacob Faibussowitsch 11279566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11289566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1129aa372e3fSPaul Mullowney 1130da79fbbcSStefano Zampini /* assign the pointer */ 1131aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1132aa372e3fSPaul Mullowney 1133aa372e3fSPaul Mullowney /*********************************************/ 1134aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1135aa372e3fSPaul Mullowney /*********************************************/ 1136aa372e3fSPaul Mullowney 1137aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 11389566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactorT)); 1139da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1140aa372e3fSPaul Mullowney 1141aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1142aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1143aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 
11449371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1145aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1146aa372e3fSPaul Mullowney 1147aa372e3fSPaul Mullowney /* Create the matrix description */ 11489566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 11499566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 11509566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 11519566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 11529566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1153aa372e3fSPaul Mullowney 1154aa372e3fSPaul Mullowney /* set the operation */ 1155aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1156aa372e3fSPaul Mullowney 1157aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1158aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1159afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1160afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1161aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1162afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1163afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1164afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1165aa372e3fSPaul Mullowney 1166aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1167afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11689371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 11699371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 11709371c9d4SSatish Balay upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 11719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1172afb2bd1cSJunchao Zhang #endif 1173afb2bd1cSJunchao Zhang 11749566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 11759f7ba44dSJacob Faibussowitsch { 11769f7ba44dSJacob Faibussowitsch // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
11779f7ba44dSJacob Faibussowitsch auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 11789371c9d4SSatish Balay upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1179afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11809f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1181afb2bd1cSJunchao Zhang #else 11829f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1183afb2bd1cSJunchao Zhang #endif 11849f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(stat); 11859f7ba44dSJacob Faibussowitsch } 1186d49cd2b7SBarry Smith 11879566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11889566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1189aa372e3fSPaul Mullowney 1190afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 11919566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1192261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 11931b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 11949371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 11959371c9d4SSatish Balay upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 
upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 11969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1197afb2bd1cSJunchao Zhang #endif 1198afb2bd1cSJunchao Zhang 1199afb2bd1cSJunchao Zhang /* perform the solve analysis */ 12005f80ce2aSJacob Faibussowitsch /* christ, would it have killed you to put this stuff in a function????????? */ 12019371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 12029f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1203d49cd2b7SBarry Smith 12049566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 12059566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1206aa372e3fSPaul Mullowney 1207da79fbbcSStefano Zampini /* assign the pointer */ 1208aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 12093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1210bda325fcSPaul Mullowney } 1211d460d7bfSJunchao Zhang #endif 1212bda325fcSPaul Mullowney 12139371c9d4SSatish Balay struct PetscScalarToPetscInt { 12149371c9d4SSatish Balay __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1215a49f1ed0SStefano Zampini }; 1216a49f1ed0SStefano Zampini 1217d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1218d71ae5a4SJacob Faibussowitsch { 1219aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1220a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 
1221bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1222bda325fcSPaul Mullowney cusparseStatus_t stat; 1223aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1224b175d8bbSPaul Mullowney 1225bda325fcSPaul Mullowney PetscFunctionBegin; 12269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1227a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 122828b400f6SJacob Faibussowitsch PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1229a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 123008401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 12313ba16761SJacob Faibussowitsch if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 12329566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 12339566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 123448a46eb9SPierre Jolivet if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1235a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1236aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 12379566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1238aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 12399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 12409566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1241aa372e3fSPaul Mullowney 1242b06137fdSPaul Mullowney /* set alpha and beta */ 1243f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar))); 1244f4f49eeaSPierre Jolivet 
PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar))); 1245f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar))); 12469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 12479566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 12489566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1249b06137fdSPaul Mullowney 1250aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1251aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1252a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1253554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1254554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1255aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1256a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1257aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1258aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1259a3fdcf43SKarl Rupp 1260ad540459SPierre Jolivet if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 126181902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1262afb2bd1cSJunchao Zhang 1263afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 12643606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 12659371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx 
type due to THRUSTINTARRAY32 */ 12669371c9d4SSatish Balay indexBase, cusparse_scalartype); 12679371c9d4SSatish Balay PetscCallCUSPARSE(stat); 12683606e59fSJunchao Zhang #else 12693606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 12703606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 12713606e59fSJunchao Zhang 12723606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 12733606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 12743606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 12753606e59fSJunchao Zhang */ 12763606e59fSJunchao Zhang if (matrixT->num_entries) { 12779371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 12789371c9d4SSatish Balay PetscCallCUSPARSE(stat); 12793606e59fSJunchao Zhang 12803606e59fSJunchao Zhang } else { 12813606e59fSJunchao Zhang matstructT->matDescr = NULL; 12823606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 12833606e59fSJunchao Zhang } 12843606e59fSJunchao Zhang #endif 1285afb2bd1cSJunchao Zhang #endif 1286aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1287afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1288afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1289afb2bd1cSJunchao Zhang #else 1290aa372e3fSPaul Mullowney CsrMatrix *temp = new 
CsrMatrix; 129151c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 129251c6d536SStefano Zampini /* First convert HYB to CSR */ 1293aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1294aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1295aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1296aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1297aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1298aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1299aa372e3fSPaul Mullowney 13009371c9d4SSatish Balay stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 13019371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1302aa372e3fSPaul Mullowney 1303aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1304aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1305aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1306aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1307aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1308aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1309aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1310aa372e3fSPaul Mullowney 13119371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 13129371c9d4SSatish Balay tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 13139371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1314aa372e3fSPaul Mullowney 1315aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1316aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 13179566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 13189371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 13199371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 13209371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1321aa372e3fSPaul Mullowney 1322aa372e3fSPaul Mullowney /* assign the pointer */ 1323aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13241a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1325aa372e3fSPaul Mullowney /* delete temporaries */ 1326aa372e3fSPaul Mullowney if (tempT) { 1327aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1328aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1329aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1330aa372e3fSPaul Mullowney delete (CsrMatrix *)tempT; 1331087f3262SPaul Mullowney } 1332aa372e3fSPaul Mullowney if (temp) { 1333aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY *)temp->values; 1334aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1335aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1336aa372e3fSPaul Mullowney delete (CsrMatrix *)temp; 1337aa372e3fSPaul Mullowney } 1338afb2bd1cSJunchao Zhang #endif 1339aa372e3fSPaul Mullowney } 1340a49f1ed0SStefano Zampini } 1341a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1342a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1343a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 134428b400f6SJacob Faibussowitsch 
PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 134528b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 134628b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 134728b400f6SJacob Faibussowitsch PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 134828b400f6SJacob Faibussowitsch PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 134928b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 135028b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 135128b400f6SJacob Faibussowitsch PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1352a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1353a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1354a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 13559566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1356a49f1ed0SStefano Zampini } 1357a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1358a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1359792fecdfSBarry Smith PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1360a49f1ed0SStefano Zampini 1361a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1362a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1363a49f1ed0SStefano Zampini void *csr2cscBuffer; 1364a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 13659371c9d4SSatish Balay stat = 
cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 13669371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize); 13679371c9d4SSatish Balay PetscCallCUSPARSE(stat); 13689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1369a49f1ed0SStefano Zampini #endif 1370a49f1ed0SStefano Zampini 13711a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13721a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13731a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13741a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13751a2c6b5cSJunchao Zhang 13761a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13771a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
13781a2c6b5cSJunchao Zhang */ 13799371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1380a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 13819371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 13829371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1383a49f1ed0SStefano Zampini #else 13849371c9d4SSatish Balay matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 13859371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1386a49f1ed0SStefano Zampini #endif 13871a2c6b5cSJunchao Zhang } else { 13881a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 13891a2c6b5cSJunchao Zhang } 13901a2c6b5cSJunchao Zhang 1391a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1392792fecdfSBarry Smith PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1393a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 13949566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1395a49f1ed0SStefano Zampini #endif 1396a49f1ed0SStefano Zampini } 13979371c9d4SSatish Balay PetscCallThrust( 13989371c9d4SSatish Balay thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1399a49f1ed0SStefano Zampini } 14009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14019566063dSJacob 
Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1402213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1403213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1404aa372e3fSPaul Mullowney /* assign the pointer */ 1405aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 14061a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 14073ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1408bda325fcSPaul Mullowney } 1409bda325fcSPaul Mullowney 1410b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1411d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1412d460d7bfSJunchao Zhang { 1413d460d7bfSJunchao Zhang const PetscScalar *barray; 1414d460d7bfSJunchao Zhang PetscScalar *xarray; 1415d460d7bfSJunchao Zhang thrust::device_ptr<const PetscScalar> bGPU; 1416d460d7bfSJunchao Zhang thrust::device_ptr<PetscScalar> xGPU; 1417d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1418d460d7bfSJunchao Zhang const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1419d460d7bfSJunchao Zhang const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; 1420d460d7bfSJunchao Zhang const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1421d460d7bfSJunchao Zhang PetscInt m = A->rmap->n; 1422d460d7bfSJunchao Zhang 1423d460d7bfSJunchao Zhang PetscFunctionBegin; 1424d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1425d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1426d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1427d460d7bfSJunchao Zhang xGPU = thrust::device_pointer_cast(xarray); 1428d460d7bfSJunchao Zhang bGPU = thrust::device_pointer_cast(barray); 1429d460d7bfSJunchao Zhang 1430d460d7bfSJunchao Zhang // Reorder b with the row permutation if needed, and wrap the result in fs->X 1431d460d7bfSJunchao 
Zhang if (fs->rpermIndices) { 1432d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1433d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1434d460d7bfSJunchao Zhang } else { 1435d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1436d460d7bfSJunchao Zhang } 1437d460d7bfSJunchao Zhang 1438d460d7bfSJunchao Zhang // Solve L Y = X 1439d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1440d460d7bfSJunchao Zhang // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()! 1441d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L)); 1442d460d7bfSJunchao Zhang 1443d460d7bfSJunchao Zhang // Solve U X = Y 1444d460d7bfSJunchao Zhang if (fs->cpermIndices) { 1445d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1446d460d7bfSJunchao Zhang } else { 1447d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1448d460d7bfSJunchao Zhang } 1449d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 1450d460d7bfSJunchao Zhang 1451d460d7bfSJunchao Zhang // Reorder X with the column permutation if needed, and put the result back to x 1452d460d7bfSJunchao Zhang if (fs->cpermIndices) { 1453d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), 
fs->cpermIndices->begin()), 1454d460d7bfSJunchao Zhang thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1455d460d7bfSJunchao Zhang } 1456d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1457d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1458d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1459d460d7bfSJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); 1460d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1461d460d7bfSJunchao Zhang } 1462d460d7bfSJunchao Zhang 1463d460d7bfSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1464d460d7bfSJunchao Zhang { 1465d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1466d460d7bfSJunchao Zhang Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1467d460d7bfSJunchao Zhang const PetscScalar *barray; 1468d460d7bfSJunchao Zhang PetscScalar *xarray; 1469d460d7bfSJunchao Zhang thrust::device_ptr<const PetscScalar> bGPU; 1470d460d7bfSJunchao Zhang thrust::device_ptr<PetscScalar> xGPU; 1471d460d7bfSJunchao Zhang const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE; 1472d460d7bfSJunchao Zhang const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1473d460d7bfSJunchao Zhang PetscInt m = A->rmap->n; 1474d460d7bfSJunchao Zhang 1475d460d7bfSJunchao Zhang PetscFunctionBegin; 1476d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1477d460d7bfSJunchao Zhang if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time 1478d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1479d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. 
We only do transpose solve with it */ 1480d460d7bfSJunchao Zhang fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1481d460d7bfSJunchao Zhang 1482d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1483d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1484d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1485d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1486d460d7bfSJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_TRUE; 1487d460d7bfSJunchao Zhang } 1488d460d7bfSJunchao Zhang 1489d460d7bfSJunchao Zhang if (!fs->updatedTransposeSpSVAnalysis) { 1490d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1491d460d7bfSJunchao Zhang 1492d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1493d460d7bfSJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1494d460d7bfSJunchao Zhang } 1495d460d7bfSJunchao Zhang 1496d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1497d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1498d460d7bfSJunchao Zhang xGPU = thrust::device_pointer_cast(xarray); 1499d460d7bfSJunchao Zhang bGPU = thrust::device_pointer_cast(barray); 1500d460d7bfSJunchao Zhang 1501d460d7bfSJunchao Zhang // Reorder b with the row permutation if needed, and wrap the result in fs->X 1502d460d7bfSJunchao Zhang if (fs->rpermIndices) { 
1503d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1504d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1505d460d7bfSJunchao Zhang } else { 1506d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1507d460d7bfSJunchao Zhang } 1508d460d7bfSJunchao Zhang 1509d460d7bfSJunchao Zhang // Solve Ut Y = X 1510d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1511d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 1512d460d7bfSJunchao Zhang 1513d460d7bfSJunchao Zhang // Solve Lt X = Y 1514d460d7bfSJunchao Zhang if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 1515d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1516d460d7bfSJunchao Zhang } else { 1517d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1518d460d7bfSJunchao Zhang } 1519d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt)); 1520d460d7bfSJunchao Zhang 1521d460d7bfSJunchao Zhang // Reorder X with the column permutation if needed, and put the result back to x 1522d460d7bfSJunchao Zhang if (fs->cpermIndices) { 1523d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1524d460d7bfSJunchao Zhang 
thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1525d460d7bfSJunchao Zhang } 1526d460d7bfSJunchao Zhang 1527d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1528d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1529d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1530d460d7bfSJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n)); 1531d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1532d460d7bfSJunchao Zhang } 1533d460d7bfSJunchao Zhang #else 1534a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1535d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1536d71ae5a4SJacob Faibussowitsch { 1537c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1538465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1539465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1540465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1541465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1542bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1543aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1544aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1545aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1546bda325fcSPaul Mullowney 1547bda325fcSPaul Mullowney PetscFunctionBegin; 1548aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1549aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 15509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1551aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1552aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1553bda325fcSPaul Mullowney } 1554bda325fcSPaul Mullowney 1555bda325fcSPaul Mullowney /* Get the GPU pointers */ 15569566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 15579566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1558c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1559c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1560bda325fcSPaul Mullowney 15619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1562aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 15639371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1564aa372e3fSPaul Mullowney 1565aa372e3fSPaul Mullowney /* First, solve U */ 15669f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 15679f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1568aa372e3fSPaul Mullowney 1569aa372e3fSPaul Mullowney /* Then, solve L */ 15709f7ba44dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 15719f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1572aa372e3fSPaul Mullowney 1573aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 15749371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()); 1575aa372e3fSPaul Mullowney 1576aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. 
*/ 1577a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU); 1578bda325fcSPaul Mullowney 1579bda325fcSPaul Mullowney /* restore */ 15809566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 15819566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 15829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15839566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 15843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1585bda325fcSPaul Mullowney } 1586bda325fcSPaul Mullowney 1587d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1588d71ae5a4SJacob Faibussowitsch { 1589465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1590465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1591bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1592aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1593aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1594aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1595bda325fcSPaul Mullowney 1596bda325fcSPaul Mullowney PetscFunctionBegin; 1597aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1598aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 15999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1600aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1601aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1602bda325fcSPaul Mullowney } 1603bda325fcSPaul Mullowney 1604bda325fcSPaul Mullowney /* Get the GPU pointers */ 16059566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 16069566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1607bda325fcSPaul Mullowney 16089566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1609aa372e3fSPaul Mullowney /* First, solve U */ 16109f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 16119f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1612aa372e3fSPaul Mullowney 1613aa372e3fSPaul Mullowney /* Then, solve L */ 16149f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 16159f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1616bda325fcSPaul Mullowney 
1617bda325fcSPaul Mullowney /* restore */ 16189566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 16199566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 16209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16219566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 16223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1623bda325fcSPaul Mullowney } 1624bda325fcSPaul Mullowney 1625d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1626d71ae5a4SJacob Faibussowitsch { 1627465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1628465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1629465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1630465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 16319ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1632aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1633aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1634aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 16359ae82921SPaul Mullowney 16369ae82921SPaul Mullowney PetscFunctionBegin; 1637e057df02SPaul Mullowney /* Get the GPU pointers */ 16389566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 16399566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1640c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1641c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 16429ae82921SPaul Mullowney 16439566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 
1644aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 16459371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1646aa372e3fSPaul Mullowney 1647aa372e3fSPaul Mullowney /* Next, solve L */ 16489f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 16499f7ba44dSJacob Faibussowitsch loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1650aa372e3fSPaul Mullowney 1651aa372e3fSPaul Mullowney /* Then, solve U */ 16529f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 16539f7ba44dSJacob Faibussowitsch upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 1654d49cd2b7SBarry Smith 16554e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 16569371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 16579ae82921SPaul Mullowney 16589566063dSJacob Faibussowitsch 
PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 16599566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 16609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 16623ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16639ae82921SPaul Mullowney } 16649ae82921SPaul Mullowney 1665d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1666d71ae5a4SJacob Faibussowitsch { 1667465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1668465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 16699ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1670aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1671aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1672aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 16739ae82921SPaul Mullowney 16749ae82921SPaul Mullowney PetscFunctionBegin; 1675e057df02SPaul Mullowney /* Get the GPU pointers */ 16769566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 16779566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 16789ae82921SPaul Mullowney 16799566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1680aa372e3fSPaul Mullowney /* First, solve L */ 16819f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 16829f7ba44dSJacob Faibussowitsch 
loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1683d49cd2b7SBarry Smith 1684aa372e3fSPaul Mullowney /* Next, solve U */ 16859f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 16869f7ba44dSJacob Faibussowitsch upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 16879ae82921SPaul Mullowney 16889566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 16899566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 16909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16919566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 16923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16939ae82921SPaul Mullowney } 1694d460d7bfSJunchao Zhang #endif 16959ae82921SPaul Mullowney 1696b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 16978eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *) 1698d71ae5a4SJacob Faibussowitsch { 1699da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1700da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1701da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1702da112707SJunchao Zhang CsrMatrix *Acsr; 1703da112707SJunchao Zhang PetscInt m, nz; 1704da112707SJunchao Zhang PetscBool flg; 1705da112707SJunchao Zhang 
1706da112707SJunchao Zhang PetscFunctionBegin; 1707da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1708da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1709da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1710da112707SJunchao Zhang } 1711da112707SJunchao Zhang 1712da112707SJunchao Zhang /* Copy A's value to fact */ 1713da112707SJunchao Zhang m = fact->rmap->n; 1714da112707SJunchao Zhang nz = aij->nz; 1715da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1716da112707SJunchao Zhang Acsr = (CsrMatrix *)Acusp->mat->mat; 1717da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1718da112707SJunchao Zhang 1719bdb0d812SBarry Smith PetscCall(PetscLogGpuTimeBegin()); 1720da112707SJunchao Zhang /* Factorize fact inplace */ 17219371c9d4SSatish Balay if (m) 17229371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1723d460d7bfSJunchao Zhang fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1724da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1725da112707SJunchao Zhang int numerical_zero; 1726da112707SJunchao Zhang cusparseStatus_t status; 1727da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1728da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1729da112707SJunchao Zhang } 1730da112707SJunchao Zhang 173112ba2bc6SJunchao Zhang /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it 
after cusparseXcsrilu02() 173212ba2bc6SJunchao Zhang See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 173312ba2bc6SJunchao Zhang */ 17349371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1735da112707SJunchao Zhang 17369371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1737da112707SJunchao Zhang 173812ba2bc6SJunchao Zhang /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 173912ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 174012ba2bc6SJunchao Zhang 1741da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 1742d460d7bfSJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. 
1743d460d7bfSJunchao Zhang fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1744da112707SJunchao Zhang fact->ops->matsolve = NULL; 1745da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 1746bdb0d812SBarry Smith PetscCall(PetscLogGpuTimeEnd()); 1747da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 17483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1749da112707SJunchao Zhang } 1750da112707SJunchao Zhang 17518eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1752d71ae5a4SJacob Faibussowitsch { 1753da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1754da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1755da112707SJunchao Zhang PetscInt m, nz; 1756da112707SJunchao Zhang 1757da112707SJunchao Zhang PetscFunctionBegin; 1758da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1759da112707SJunchao Zhang PetscInt i; 1760da112707SJunchao Zhang PetscBool flg, missing; 1761da112707SJunchao Zhang 1762da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1763da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1764da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1765da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A, &missing, &i)); 1766da112707SJunchao Zhang PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1767da112707SJunchao Zhang } 1768da112707SJunchao Zhang 1769da112707SJunchao Zhang /* Free the old stale stuff */ 1770da112707SJunchao Zhang 
PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1771da112707SJunchao Zhang 1772da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1773da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 1774da112707SJunchao Zhang */ 1775da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1776da112707SJunchao Zhang 1777da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 1778da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ILU; 1779da112707SJunchao Zhang fact->info.factor_mallocs = 0; 1780da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 1781da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 1782da112707SJunchao Zhang 1783da112707SJunchao Zhang aij->row = NULL; 1784da112707SJunchao Zhang aij->col = NULL; 1785da112707SJunchao Zhang 1786da112707SJunchao Zhang /* ====================================================================== */ 1787da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. */ 1788da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 1789da112707SJunchao Zhang /* ====================================================================== */ 1790da112707SJunchao Zhang const int *Ai, *Aj; 1791da112707SJunchao Zhang 1792da112707SJunchao Zhang m = fact->rmap->n; 1793da112707SJunchao Zhang nz = aij->nz; 1794da112707SJunchao Zhang 1795f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1796f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1797f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1798d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
The returned Ai, Aj are 32-bit */ 1799d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1800d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1801da112707SJunchao Zhang 1802da112707SJunchao Zhang /* ====================================================================== */ 1803da112707SJunchao Zhang /* Create descriptors for M, L, U */ 1804da112707SJunchao Zhang /* ====================================================================== */ 1805da112707SJunchao Zhang cusparseFillMode_t fillMode; 1806da112707SJunchao Zhang cusparseDiagType_t diagType; 1807da112707SJunchao Zhang 1808da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1809da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1810da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1811da112707SJunchao Zhang 1812da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1813da112707SJunchao Zhang cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1814da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1815da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1816da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1817da112707SJunchao Zhang */ 1818da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 1819da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_UNIT; 1820d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 18219371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 18229371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1823da112707SJunchao Zhang 1824da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_UPPER; 1825da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1826d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 18279371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 18289371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1829da112707SJunchao Zhang 1830da112707SJunchao Zhang /* ========================================================================= */ 1831da112707SJunchao Zhang /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1832da112707SJunchao Zhang /* ========================================================================= */ 1833da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 18349371c9d4SSatish Balay if (m) 18359371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1836d460d7bfSJunchao Zhang 
fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1837da112707SJunchao Zhang 1838da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1839da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1840da112707SJunchao Zhang 1841da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1842da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1843da112707SJunchao Zhang 1844da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 18459371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1846da112707SJunchao Zhang 1847da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 18489371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1849da112707SJunchao Zhang 1850da112707SJunchao Zhang /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 185112ba2bc6SJunchao Zhang and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 185212ba2bc6SJunchao Zhang spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 185312ba2bc6SJunchao Zhang To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1854da112707SJunchao Zhang */ 185512ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 185612ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 185712ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 1858da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 185912ba2bc6SJunchao Zhang } else { 186012ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 186112ba2bc6SJunchao Zhang fs->spsvBuffer_U = fs->factBuffer_M; 1862da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 186312ba2bc6SJunchao Zhang } 1864da112707SJunchao Zhang 1865da112707SJunchao Zhang /* ========================================================================== */ 1866da112707SJunchao Zhang /* Perform analysis of ilu0 on M, SpSv on L and U */ 1867da112707SJunchao Zhang /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1868da112707SJunchao Zhang /* ========================================================================== */ 1869da112707SJunchao Zhang int structural_zero; 1870da112707SJunchao Zhang cusparseStatus_t status; 1871da112707SJunchao Zhang 1872da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 18739371c9d4SSatish Balay if (m) 18749371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1875d460d7bfSJunchao Zhang fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1876da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1877da112707SJunchao Zhang /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1878da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1879da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1880da112707SJunchao Zhang } 1881da112707SJunchao Zhang 1882da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 18830dd8c0acSJunchao Zhang { 1884da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 18850dd8c0acSJunchao Zhang PetscInt *Ai, *Adiag, nzRow, nzLeft; 1886da112707SJunchao Zhang PetscLogDouble flops = 0.0; 1887da112707SJunchao Zhang 1888da112707SJunchao Zhang PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1889da112707SJunchao Zhang Ai = Aseq->i; 1890da112707SJunchao Zhang Adiag = Aseq->diag; 1891da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 1892da112707SJunchao Zhang if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1893da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 1894da112707SJunchao Zhang nzLeft = Adiag[i] - Ai[i]; 1895da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1896da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 
1897da112707SJunchao Zhang */ 1898da112707SJunchao Zhang nzLeft = (nzRow - 1) / 2; 1899da112707SJunchao Zhang flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1900da112707SJunchao Zhang } 1901da112707SJunchao Zhang } 1902da112707SJunchao Zhang fs->numericFactFlops = flops; 19030dd8c0acSJunchao Zhang } 1904da112707SJunchao Zhang fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 19053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1906da112707SJunchao Zhang } 1907da112707SJunchao Zhang 1908d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1909d71ae5a4SJacob Faibussowitsch { 1910da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1911da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1912da112707SJunchao Zhang const PetscScalar *barray; 1913da112707SJunchao Zhang PetscScalar *xarray; 1914da112707SJunchao Zhang 1915da112707SJunchao Zhang PetscFunctionBegin; 1916da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1917da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1918da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1919da112707SJunchao Zhang 1920da112707SJunchao Zhang /* Solve L*y = b */ 1921da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1922da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 19239371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 19249371c9d4SSatish Balay fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1925da112707SJunchao Zhang 1926da112707SJunchao Zhang /* Solve Lt*x = y */ 1927da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 19289371c9d4SSatish Balay 
PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 19299371c9d4SSatish Balay fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1930da112707SJunchao Zhang 1931da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1932da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1933da112707SJunchao Zhang 1934da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1935da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 19363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1937da112707SJunchao Zhang } 1938da112707SJunchao Zhang 19398eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1940d71ae5a4SJacob Faibussowitsch { 1941da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1942da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1943da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1944da112707SJunchao Zhang CsrMatrix *Acsr; 1945da112707SJunchao Zhang PetscInt m, nz; 1946da112707SJunchao Zhang PetscBool flg; 1947da112707SJunchao Zhang 1948da112707SJunchao Zhang PetscFunctionBegin; 1949da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1950da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1951da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1952da112707SJunchao Zhang } 1953da112707SJunchao Zhang 1954da112707SJunchao Zhang /* Copy A's value to fact */ 1955da112707SJunchao Zhang m = fact->rmap->n; 1956da112707SJunchao Zhang nz = aij->nz; 1957da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1958da112707SJunchao Zhang 
Acsr = (CsrMatrix *)Acusp->mat->mat; 1959da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1960da112707SJunchao Zhang 1961da112707SJunchao Zhang /* Factorize fact inplace */ 1962da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1963da112707SJunchao Zhang Function csric02() only takes the lower triangular part of matrix A to perform factorization. 1964da112707SJunchao Zhang The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1965da112707SJunchao Zhang and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1966da112707SJunchao Zhang In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1967da112707SJunchao Zhang */ 1968d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1969da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1970da112707SJunchao Zhang int numerical_zero; 1971da112707SJunchao Zhang cusparseStatus_t status; 1972da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1973da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1974da112707SJunchao Zhang } 1975da112707SJunchao Zhang 19769371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1977da112707SJunchao Zhang 1978da112707SJunchao 
Zhang /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1979da112707SJunchao Zhang ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1980da112707SJunchao Zhang */ 19819371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1982da112707SJunchao Zhang 1983da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 1984da112707SJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1985da112707SJunchao Zhang fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1986da112707SJunchao Zhang fact->ops->matsolve = NULL; 1987da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 1988da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 19893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1990da112707SJunchao Zhang } 1991da112707SJunchao Zhang 19928eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 1993d71ae5a4SJacob Faibussowitsch { 1994da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1995da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1996da112707SJunchao Zhang PetscInt m, nz; 1997da112707SJunchao Zhang 1998da112707SJunchao Zhang PetscFunctionBegin; 1999da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2000da112707SJunchao Zhang PetscInt i; 2001da112707SJunchao Zhang PetscBool flg, missing; 2002da112707SJunchao Zhang 2003da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2004da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected 
MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2005da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2006da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A, &missing, &i)); 2007da112707SJunchao Zhang PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2008da112707SJunchao Zhang } 2009da112707SJunchao Zhang 2010da112707SJunchao Zhang /* Free the old stale stuff */ 2011da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2012da112707SJunchao Zhang 2013da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2014da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 2015da112707SJunchao Zhang */ 2016da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2017da112707SJunchao Zhang 2018da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 2019da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ICC; 2020da112707SJunchao Zhang fact->info.factor_mallocs = 0; 2021da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 2022da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 2023da112707SJunchao Zhang 2024da112707SJunchao Zhang aij->row = NULL; 2025da112707SJunchao Zhang aij->col = NULL; 2026da112707SJunchao Zhang 2027da112707SJunchao Zhang /* ====================================================================== */ 2028da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 2029da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 2030da112707SJunchao Zhang /* ====================================================================== */ 2031da112707SJunchao Zhang const int *Ai, *Aj; 2032da112707SJunchao Zhang 2033da112707SJunchao Zhang m = fact->rmap->n; 2034da112707SJunchao Zhang nz = aij->nz; 2035da112707SJunchao Zhang 2036f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 2037f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 2038da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2039da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2040d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2041d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2042da112707SJunchao Zhang 2043da112707SJunchao Zhang /* ====================================================================== */ 2044da112707SJunchao Zhang /* Create mat descriptors for M, L */ 2045da112707SJunchao Zhang /* ====================================================================== */ 2046da112707SJunchao Zhang cusparseFillMode_t fillMode; 2047da112707SJunchao Zhang cusparseDiagType_t diagType; 2048da112707SJunchao Zhang 2049da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2050da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2051da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2052da112707SJunchao Zhang 2053da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 
2054da112707SJunchao Zhang cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2055da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2056da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2057da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 2058da112707SJunchao Zhang */ 2059da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 2060da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2061d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 20629371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 20639371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 2064da112707SJunchao Zhang 2065da112707SJunchao Zhang /* ========================================================================= */ 2066da112707SJunchao Zhang /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2067da112707SJunchao Zhang /* ========================================================================= */ 2068da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2069d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); 2070da112707SJunchao Zhang 2071da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 
2072da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 2073da112707SJunchao Zhang 2074da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 2075da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 2076da112707SJunchao Zhang 2077da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 20789371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 2079da112707SJunchao Zhang 2080da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 20819371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 2082da112707SJunchao Zhang 208312ba2bc6SJunchao Zhang /* To save device memory, we make the factorization buffer share with one of the solver buffer. 208412ba2bc6SJunchao Zhang See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 
208512ba2bc6SJunchao Zhang */ 208612ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 208712ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 208812ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 2089da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 209012ba2bc6SJunchao Zhang } else { 209112ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 209212ba2bc6SJunchao Zhang fs->spsvBuffer_Lt = fs->factBuffer_M; 209312ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 209412ba2bc6SJunchao Zhang } 2095da112707SJunchao Zhang 2096da112707SJunchao Zhang /* ========================================================================== */ 2097da112707SJunchao Zhang /* Perform analysis of ic0 on M */ 2098da112707SJunchao Zhang /* The lower triangular part of M has the same sparsity pattern as L */ 2099da112707SJunchao Zhang /* ========================================================================== */ 2100da112707SJunchao Zhang int structural_zero; 2101da112707SJunchao Zhang cusparseStatus_t status; 2102da112707SJunchao Zhang 2103da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2104d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 2105da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2106da112707SJunchao Zhang /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 2107da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2108da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 2109da112707SJunchao Zhang } 2110da112707SJunchao Zhang 2111da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 21120dd8c0acSJunchao Zhang { 2113da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 21140dd8c0acSJunchao Zhang PetscInt *Ai, nzRow, nzLeft; 2115da112707SJunchao Zhang PetscLogDouble flops = 0.0; 2116da112707SJunchao Zhang 2117da112707SJunchao Zhang Ai = Aseq->i; 2118da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2119da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 2120da112707SJunchao Zhang if (nzRow > 1) { 2121da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2122da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 2123da112707SJunchao Zhang */ 2124da112707SJunchao Zhang nzLeft = (nzRow - 1) / 2; 2125da112707SJunchao Zhang flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 2126da112707SJunchao Zhang } 2127da112707SJunchao Zhang } 2128da112707SJunchao Zhang fs->numericFactFlops = flops; 21290dd8c0acSJunchao Zhang } 2130da112707SJunchao Zhang fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 21313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2132da112707SJunchao Zhang } 2133da112707SJunchao Zhang #endif 2134da112707SJunchao Zhang 2135d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 2136d460d7bfSJunchao Zhang { 2137b820271fSJunchao Zhang // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors. 
2138b820271fSJunchao Zhang Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2139d460d7bfSJunchao Zhang 2140d460d7bfSJunchao Zhang PetscFunctionBegin; 2141d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2142d460d7bfSJunchao Zhang PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 2143d460d7bfSJunchao Zhang B->offloadmask = PETSC_OFFLOAD_CPU; 2144d460d7bfSJunchao Zhang 2145d460d7bfSJunchao Zhang if (!cusparsestruct->use_cpu_solve) { 2146b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2147d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; 2148d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 2149d460d7bfSJunchao Zhang #else 2150d460d7bfSJunchao Zhang /* determine which version of MatSolve needs to be used. */ 2151d460d7bfSJunchao Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2152d460d7bfSJunchao Zhang IS isrow = b->row, iscol = b->col; 2153d460d7bfSJunchao Zhang PetscBool row_identity, col_identity; 2154d460d7bfSJunchao Zhang 2155d460d7bfSJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 2156d460d7bfSJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 2157d460d7bfSJunchao Zhang if (row_identity && col_identity) { 2158d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2159d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2160d460d7bfSJunchao Zhang } else { 2161d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2162d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2163d460d7bfSJunchao Zhang } 2164d460d7bfSJunchao Zhang #endif 2165d460d7bfSJunchao Zhang } 2166d460d7bfSJunchao Zhang B->ops->matsolve = NULL; 2167d460d7bfSJunchao Zhang B->ops->matsolvetranspose = NULL; 2168d460d7bfSJunchao Zhang 2169d460d7bfSJunchao Zhang /* get the triangular factors */ 2170d460d7bfSJunchao Zhang if (!cusparsestruct->use_cpu_solve) 
PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2171d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2172d460d7bfSJunchao Zhang } 2173d460d7bfSJunchao Zhang 2174d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2175d460d7bfSJunchao Zhang { 2176d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2177d460d7bfSJunchao Zhang 2178d460d7bfSJunchao Zhang PetscFunctionBegin; 2179d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2180d460d7bfSJunchao Zhang PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2181d460d7bfSJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2182d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2183d460d7bfSJunchao Zhang } 2184d460d7bfSJunchao Zhang 2185d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2186d71ae5a4SJacob Faibussowitsch { 2187da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2188da112707SJunchao Zhang 2189da112707SJunchao Zhang PetscFunctionBegin; 2190b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2191bc996fdcSJunchao Zhang PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2192*f82ac72cSJunchao Zhang if (!info->factoronhost) { 2193da112707SJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 2194da112707SJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 2195bc996fdcSJunchao Zhang } 2196da112707SJunchao Zhang if (!info->levels && row_identity && col_identity) { 2197da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 2198da112707SJunchao Zhang } else 2199da112707SJunchao Zhang #endif 2200da112707SJunchao 
Zhang { 2201da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2202da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2203da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2204da112707SJunchao Zhang } 22053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2206da112707SJunchao Zhang } 2207da112707SJunchao Zhang 2208d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2209d71ae5a4SJacob Faibussowitsch { 2210da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2211da112707SJunchao Zhang 2212da112707SJunchao Zhang PetscFunctionBegin; 2213b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2214bc996fdcSJunchao Zhang PetscBool perm_identity = PETSC_FALSE; 2215*f82ac72cSJunchao Zhang if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); 2216da112707SJunchao Zhang if (!info->levels && perm_identity) { 2217da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2218da112707SJunchao Zhang } else 2219da112707SJunchao Zhang #endif 2220da112707SJunchao Zhang { 2221da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2222da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2223da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2224da112707SJunchao Zhang } 22253ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2226da112707SJunchao Zhang } 2227da112707SJunchao Zhang 2228d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2229d71ae5a4SJacob Faibussowitsch { 2230da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2231da112707SJunchao Zhang 2232da112707SJunchao Zhang PetscFunctionBegin; 2233da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2234da112707SJunchao Zhang PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 2235da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 22363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2237da112707SJunchao Zhang } 2238da112707SJunchao Zhang 223966976f2fSJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type) 2240d71ae5a4SJacob Faibussowitsch { 2241841d4cb1SJunchao Zhang PetscFunctionBegin; 2242841d4cb1SJunchao Zhang *type = MATSOLVERCUSPARSE; 22433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2244841d4cb1SJunchao Zhang } 2245841d4cb1SJunchao Zhang 2246841d4cb1SJunchao Zhang /*MC 2247841d4cb1SJunchao Zhang MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 224811a5261eSBarry Smith on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported 2249841d4cb1SJunchao Zhang algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2250841d4cb1SJunchao Zhang performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 225111a5261eSBarry Smith CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2252841d4cb1SJunchao Zhang algorithms are not recommended. This class does NOT support direct solver operations. 
2253841d4cb1SJunchao Zhang 2254841d4cb1SJunchao Zhang Level: beginner 2255841d4cb1SJunchao Zhang 22561cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, 22572ef1f0ffSBarry Smith `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2258841d4cb1SJunchao Zhang M*/ 2259841d4cb1SJunchao Zhang 2260d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) 2261d71ae5a4SJacob Faibussowitsch { 2262841d4cb1SJunchao Zhang PetscInt n = A->rmap->n; 2263841d4cb1SJunchao Zhang 2264841d4cb1SJunchao Zhang PetscFunctionBegin; 2265841d4cb1SJunchao Zhang PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2266841d4cb1SJunchao Zhang PetscCall(MatSetSizes(*B, n, n, n, n)); 2267b820271fSJunchao Zhang (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors 2268841d4cb1SJunchao Zhang PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); 2269841d4cb1SJunchao Zhang 2270841d4cb1SJunchao Zhang if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2271841d4cb1SJunchao Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2272841d4cb1SJunchao Zhang PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2273841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2274841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2275841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2276841d4cb1SJunchao Zhang } else { 2277841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2278841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2279841d4cb1SJunchao Zhang } 2280841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char 
**)&(*B)->preferredordering[MAT_FACTOR_LU])); 2281841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2282841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2283841d4cb1SJunchao Zhang } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2284841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2285841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2286841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2287841d4cb1SJunchao Zhang } else { 2288841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2289841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2290841d4cb1SJunchao Zhang } 2291841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2292841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2293841d4cb1SJunchao Zhang } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types"); 2294841d4cb1SJunchao Zhang 2295841d4cb1SJunchao Zhang PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2296841d4cb1SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 2297f4f49eeaSPierre Jolivet PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse)); 22983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2299841d4cb1SJunchao Zhang } 2300841d4cb1SJunchao Zhang 2301d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 2302d71ae5a4SJacob Faibussowitsch { 23037e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 23047e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = 
(Mat_SeqAIJCUSPARSE *)A->spptr; 2305b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2306da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 23070dd8c0acSJunchao Zhang #endif 23087e8381f9SStefano Zampini 23097e8381f9SStefano Zampini PetscFunctionBegin; 23107e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 23119566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 2312da112707SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 2313da112707SJunchao Zhang CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat; 23149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2315da112707SJunchao Zhang } 2316b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2317da112707SJunchao Zhang else if (fs->csrVal) { 2318da112707SJunchao Zhang /* We have a factorized matrix on device and are able to copy it to host */ 2319da112707SJunchao Zhang PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2320da112707SJunchao Zhang } 2321da112707SJunchao Zhang #endif 23229371c9d4SSatish Balay else 23239371c9d4SSatish Balay SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host"); 23249566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar))); 23259566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 23267e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 23277e8381f9SStefano Zampini } 23283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 23297e8381f9SStefano Zampini } 23307e8381f9SStefano Zampini 2331d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2332d71ae5a4SJacob Faibussowitsch { 23337e8381f9SStefano Zampini 
PetscFunctionBegin; 23349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 233567a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 23363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 233767a45760SJunchao Zhang } 233867a45760SJunchao Zhang 2339d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2340d71ae5a4SJacob Faibussowitsch { 234167a45760SJunchao Zhang PetscFunctionBegin; 23427e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 234367a45760SJunchao Zhang *array = NULL; 23443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 234567a45760SJunchao Zhang } 234667a45760SJunchao Zhang 2347d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2348d71ae5a4SJacob Faibussowitsch { 234967a45760SJunchao Zhang PetscFunctionBegin; 23509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 235167a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 23523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 235367a45760SJunchao Zhang } 235467a45760SJunchao Zhang 23558eb1d50fSPierre Jolivet static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[]) 2356d71ae5a4SJacob Faibussowitsch { 235767a45760SJunchao Zhang PetscFunctionBegin; 235867a45760SJunchao Zhang *array = NULL; 23593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 236067a45760SJunchao Zhang } 236167a45760SJunchao Zhang 2362d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2363d71ae5a4SJacob Faibussowitsch { 236467a45760SJunchao Zhang PetscFunctionBegin; 236567a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 23663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 236767a45760SJunchao Zhang } 236867a45760SJunchao Zhang 2369d71ae5a4SJacob 
Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2370d71ae5a4SJacob Faibussowitsch { 237167a45760SJunchao Zhang PetscFunctionBegin; 237267a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 237367a45760SJunchao Zhang *array = NULL; 23743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 23757e8381f9SStefano Zampini } 23767e8381f9SStefano Zampini 2377d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 2378d71ae5a4SJacob Faibussowitsch { 23797ee59b9bSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp; 23807ee59b9bSJunchao Zhang CsrMatrix *matrix; 23817ee59b9bSJunchao Zhang 23827ee59b9bSJunchao Zhang PetscFunctionBegin; 23837ee59b9bSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 23847ee59b9bSJunchao Zhang PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 23857ee59b9bSJunchao Zhang cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 23867ee59b9bSJunchao Zhang PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL"); 23877ee59b9bSJunchao Zhang matrix = (CsrMatrix *)cusp->mat->mat; 23887ee59b9bSJunchao Zhang 23897ee59b9bSJunchao Zhang if (i) { 23907ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 23917ee59b9bSJunchao Zhang *i = matrix->row_offsets->data().get(); 23927ee59b9bSJunchao Zhang #else 23937ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 23947ee59b9bSJunchao Zhang #endif 23957ee59b9bSJunchao Zhang } 23967ee59b9bSJunchao Zhang if (j) { 23977ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 23987ee59b9bSJunchao Zhang *j = matrix->column_indices->data().get(); 23997ee59b9bSJunchao Zhang #else 24007ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, 
"cuSparse does not supported 64-bit indices"); 24017ee59b9bSJunchao Zhang #endif 24027ee59b9bSJunchao Zhang } 24037ee59b9bSJunchao Zhang if (a) *a = matrix->values->data().get(); 24047ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 24053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 24067ee59b9bSJunchao Zhang } 24077ee59b9bSJunchao Zhang 2408d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 2409d71ae5a4SJacob Faibussowitsch { 2410aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 24117c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 24129ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2413213423ffSJunchao Zhang PetscInt m = A->rmap->n, *ii, *ridx, tmp; 2414aa372e3fSPaul Mullowney cusparseStatus_t stat; 2415abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 24169ae82921SPaul Mullowney 24179ae82921SPaul Mullowney PetscFunctionBegin; 241828b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU"); 2419c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2420a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2421a49f1ed0SStefano Zampini CsrMatrix *matrix; 2422afb2bd1cSJunchao Zhang matrix = (CsrMatrix *)cusparsestruct->mat->mat; 242385ba7357SStefano Zampini 242408401ef6SPierre Jolivet PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 24259566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2426afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a + a->nz); 24279566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 2428f4f49eeaSPierre Jolivet PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar))); 24299566063dSJacob 
Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 24309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 243134d6c7a5SJose E. Roman } else { 2432abb89eb1SStefano Zampini PetscInt nnz; 24339566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 24349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 24359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 24367c700b8dSJunchao Zhang delete cusparsestruct->workVector; 243781902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 2438a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 2439a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 24409ae82921SPaul Mullowney try { 24419ae82921SPaul Mullowney if (a->compressedrow.use) { 24429ae82921SPaul Mullowney m = a->compressedrow.nrows; 24439ae82921SPaul Mullowney ii = a->compressedrow.i; 24449ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 24459ae82921SPaul Mullowney } else { 2446213423ffSJunchao Zhang m = A->rmap->n; 2447213423ffSJunchao Zhang ii = a->i; 2448e6e9a74fSStefano Zampini ridx = NULL; 24499ae82921SPaul Mullowney } 245008401ef6SPierre Jolivet PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 24519371c9d4SSatish Balay if (!a->a) { 24529371c9d4SSatish Balay nnz = ii[m]; 24539371c9d4SSatish Balay both = PETSC_FALSE; 24549371c9d4SSatish Balay } else nnz = a->nz; 245508401ef6SPierre Jolivet PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 24569ae82921SPaul Mullowney 245785ba7357SStefano Zampini /* create cusparse matrix */ 2458abb89eb1SStefano Zampini cusparsestruct->nrows = m; 2459aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 24609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 
24619566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 24629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 24639ae82921SPaul Mullowney 2464f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar))); 2465f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar))); 2466f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar))); 24679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 24689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 24699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 24709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2471b06137fdSPaul Mullowney 2472aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2473aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2474aa372e3fSPaul Mullowney /* set the matrix */ 2475afb2bd1cSJunchao Zhang CsrMatrix *mat = new CsrMatrix; 2476afb2bd1cSJunchao Zhang mat->num_rows = m; 2477afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2478abb89eb1SStefano Zampini mat->num_entries = nnz; 2479ee477ddbSJunchao Zhang PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2480afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m + 1); 24819ae82921SPaul Mullowney 2482ee477ddbSJunchao Zhang PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2483abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j + nnz); 
2484aa372e3fSPaul Mullowney 2485ee477ddbSJunchao Zhang PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2486abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a + nnz); 2487aa372e3fSPaul Mullowney 2488aa372e3fSPaul Mullowney /* assign the pointer */ 2489afb2bd1cSJunchao Zhang matstruct->mat = mat; 2490afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2491afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 24929371c9d4SSatish Balay stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 24939371c9d4SSatish Balay CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 24949371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2495afb2bd1cSJunchao Zhang } 2496afb2bd1cSJunchao Zhang #endif 2497aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2498afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2499afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2500afb2bd1cSJunchao Zhang #else 2501afb2bd1cSJunchao Zhang CsrMatrix *mat = new CsrMatrix; 2502afb2bd1cSJunchao Zhang mat->num_rows = m; 2503afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2504abb89eb1SStefano Zampini mat->num_entries = nnz; 2505ee477ddbSJunchao Zhang PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2506afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m + 1); 2507aa372e3fSPaul Mullowney 2508ee477ddbSJunchao Zhang PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2509abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j + nnz); 2510aa372e3fSPaul Mullowney 2511ee477ddbSJunchao Zhang PetscCallCXX(mat->values = new 
THRUSTARRAY(nnz)); 2512abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a + nnz); 2513aa372e3fSPaul Mullowney 2514aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 25159566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 25169371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 25179371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 25189371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2519aa372e3fSPaul Mullowney /* assign the pointer */ 2520aa372e3fSPaul Mullowney matstruct->mat = hybMat; 2521aa372e3fSPaul Mullowney 2522afb2bd1cSJunchao Zhang if (mat) { 2523afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY *)mat->values; 2524afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2525afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2526afb2bd1cSJunchao Zhang delete (CsrMatrix *)mat; 2527087f3262SPaul Mullowney } 2528afb2bd1cSJunchao Zhang #endif 2529087f3262SPaul Mullowney } 2530ca45077fSPaul Mullowney 2531aa372e3fSPaul Mullowney /* assign the compressed row indices */ 2532213423ffSJunchao Zhang if (a->compressedrow.use) { 2533ee477ddbSJunchao Zhang PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m)); 2534ee477ddbSJunchao Zhang PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m)); 2535aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx, ridx + m); 2536213423ffSJunchao Zhang tmp = m; 2537213423ffSJunchao Zhang } else { 2538213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 2539213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 2540213423ffSJunchao Zhang tmp = 0; 2541213423ffSJunchao Zhang } 25429566063dSJacob 
Faibussowitsch PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2543aa372e3fSPaul Mullowney 2544aa372e3fSPaul Mullowney /* assign the pointer */ 2545aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 2546d71ae5a4SJacob Faibussowitsch } catch (char *ex) { 2547d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 2548d71ae5a4SJacob Faibussowitsch } 25499566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 25509566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 255134d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 255234d6c7a5SJose E. Roman } 2553abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 25549ae82921SPaul Mullowney } 25553ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 25569ae82921SPaul Mullowney } 25579ae82921SPaul Mullowney 25589371c9d4SSatish Balay struct VecCUDAPlusEquals { 2559aa372e3fSPaul Mullowney template <typename Tuple> 2560d71ae5a4SJacob Faibussowitsch __host__ __device__ void operator()(Tuple t) 2561d71ae5a4SJacob Faibussowitsch { 2562aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2563aa372e3fSPaul Mullowney } 2564aa372e3fSPaul Mullowney }; 2565aa372e3fSPaul Mullowney 25669371c9d4SSatish Balay struct VecCUDAEquals { 25677e8381f9SStefano Zampini template <typename Tuple> 2568d71ae5a4SJacob Faibussowitsch __host__ __device__ void operator()(Tuple t) 2569d71ae5a4SJacob Faibussowitsch { 25707e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 25717e8381f9SStefano Zampini } 25727e8381f9SStefano Zampini }; 25737e8381f9SStefano Zampini 25749371c9d4SSatish Balay struct VecCUDAEqualsReverse { 2575e6e9a74fSStefano Zampini template <typename Tuple> 2576d71ae5a4SJacob Faibussowitsch __host__ __device__ void operator()(Tuple t) 2577d71ae5a4SJacob Faibussowitsch { 2578e6e9a74fSStefano 
// Tail of VecCUDAEqualsReverse (get<0> = get<1>), then MatMatCusparse: per-product scratch data kept
// on C->product->data. Fields (as declared below): cisdense remembers whether C was a CPU MATSEQDENSE
// so the numeric phase can convert back; Bt is a device buffer for B^T used only pre-CUDA-11 where
// csrmm cannot transpose B; X is intermediate dense storage for PtAP/RARt; reusesym flags that values
// were already computed during the symbolic phase (cuSPARSE has no split symbolic/numeric for spgemm);
// Bcsr is an alternative CSR view of B when B uses compressed-row storage. CUDA >= 11 adds the SpMM/
// SpGEMM descriptors, Blda/Clda (to detect leading-dimension changes and rebuild dense descriptors),
// and the various work buffers (dBuffer4/5 for SpGEMMreuse on CUDA >= 11.4, mmBuffer/mmBuffer2).
Zampini thrust::get<0>(t) = thrust::get<1>(t); 2579e6e9a74fSStefano Zampini } 2580e6e9a74fSStefano Zampini }; 2581e6e9a74fSStefano Zampini 2582afb2bd1cSJunchao Zhang struct MatMatCusparse { 2583ccdfe979SStefano Zampini PetscBool cisdense; 2584ccdfe979SStefano Zampini PetscScalar *Bt; 2585ccdfe979SStefano Zampini Mat X; 2586fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2587fcdce8c4SStefano Zampini PetscLogDouble flops; 2588fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 2589b4285af6SJunchao Zhang 2590afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2591fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2592afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2593afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 2594afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 2595afb2bd1cSJunchao Zhang PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2596b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2597b4285af6SJunchao Zhang void *dBuffer4; 2598b4285af6SJunchao Zhang void *dBuffer5; 2599b4285af6SJunchao Zhang #endif 2600fcdce8c4SStefano Zampini size_t mmBufferSize; 2601fcdce8c4SStefano Zampini void *mmBuffer; 2602fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2603fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2604afb2bd1cSJunchao Zhang #endif 2605afb2bd1cSJunchao Zhang }; 2606ccdfe979SStefano Zampini 2607d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 2608d71ae5a4SJacob Faibussowitsch { 2609ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 2610ccdfe979SStefano Zampini 2611ccdfe979SStefano Zampini PetscFunctionBegin; 26129566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->Bt)); 2613fcdce8c4SStefano Zampini delete mmdata->Bcsr; 2614afb2bd1cSJunchao Zhang #if
PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 26159566063dSJacob Faibussowitsch if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 26169566063dSJacob Faibussowitsch if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 26179566063dSJacob Faibussowitsch if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 26189566063dSJacob Faibussowitsch if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2619b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 26209566063dSJacob Faibussowitsch if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 26219566063dSJacob Faibussowitsch if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2622b4285af6SJunchao Zhang #endif 26239566063dSJacob Faibussowitsch if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 26249566063dSJacob Faibussowitsch if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2625afb2bd1cSJunchao Zhang #endif 26269566063dSJacob Faibussowitsch PetscCall(MatDestroy(&mmdata->X)); 26279566063dSJacob Faibussowitsch PetscCall(PetscFree(data)); 26283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2629ccdfe979SStefano Zampini } 2630ccdfe979SStefano Zampini 26314742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal() 2632ccdfe979SStefano Zampini 2633d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2634d71ae5a4SJacob Faibussowitsch { 2635ccdfe979SStefano Zampini Mat_Product *product = C->product; 2636ccdfe979SStefano Zampini Mat A, B; 2637afb2bd1cSJunchao Zhang PetscInt m, n, blda, clda; 2638ccdfe979SStefano Zampini PetscBool flg, biscuda; 2639ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2640ccdfe979SStefano Zampini cusparseStatus_t stat; 2641ccdfe979SStefano Zampini cusparseOperation_t opA;
// MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA continued: validates the product data and A's type,
// refuses matrices bound to CPU (CopyToGpu would silently skip them), pushes A to the GPU, then
// selects op(A) and the result dimensions per product type. For AtB, A's explicit transpose is used
// when form_explicit_transpose is set; otherwise cuSPARSE transposes on the fly.
2642ccdfe979SStefano Zampini const PetscScalar *barray; 2643ccdfe979SStefano Zampini PetscScalar *carray; 2644ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2645ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2646ccdfe979SStefano Zampini CsrMatrix *csrmat; 2647ccdfe979SStefano Zampini 2648ccdfe979SStefano Zampini PetscFunctionBegin; 2649ccdfe979SStefano Zampini MatCheckProduct(C, 1); 265028b400f6SJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2651ccdfe979SStefano Zampini mmdata = (MatMatCusparse *)product->data; 2652ccdfe979SStefano Zampini A = product->A; 2653ccdfe979SStefano Zampini B = product->B; 26549566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 265528b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2656ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2657ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 265828b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 26599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2660ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2661ccdfe979SStefano Zampini switch (product->type) { 2662ccdfe979SStefano Zampini case MATPRODUCT_AB: 2663ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2664ccdfe979SStefano Zampini mat = cusp->mat; 2665ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2666ccdfe979SStefano Zampini m = A->rmap->n; 2667ccdfe979SStefano Zampini n = B->cmap->n; 2668ccdfe979SStefano Zampini break; 2669ccdfe979SStefano Zampini case MATPRODUCT_AtB: 26701a2c6b5cSJunchao Zhang if
(!A->form_explicit_transpose) { 2671e6e9a74fSStefano Zampini mat = cusp->mat; 2672e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2673e6e9a74fSStefano Zampini } else { 26749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2675ccdfe979SStefano Zampini mat = cusp->matTranspose; 2676ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2677e6e9a74fSStefano Zampini } 2678ccdfe979SStefano Zampini m = A->cmap->n; 2679ccdfe979SStefano Zampini n = B->cmap->n; 2680ccdfe979SStefano Zampini break; 2681ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2682ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2683ccdfe979SStefano Zampini mat = cusp->mat; 2684ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2685ccdfe979SStefano Zampini m = A->rmap->n; 2686ccdfe979SStefano Zampini n = B->rmap->n; 2687ccdfe979SStefano Zampini break; 2688d71ae5a4SJacob Faibussowitsch default: 2689d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2690ccdfe979SStefano Zampini } 269128b400f6SJacob Faibussowitsch PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2692ccdfe979SStefano Zampini csrmat = (CsrMatrix *)mat->mat; 2693ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 26949566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 26959566063dSJacob Faibussowitsch if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2696cd3f9d89SJunchao Zhang PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr)); 2697afb2bd1cSJunchao Zhang 26989566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(B, &blda)); 2699c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2700cd3f9d89SJunchao Zhang
// For RARt/PtAP the sparse product is written into the intermediate dense matrix mmdata->X
// (finished with a dense multiply at the end); otherwise directly into C.
PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); 27019566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2702c8378d12SStefano Zampini } else { 2703cd3f9d89SJunchao Zhang PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr)); 27049566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C, &clda)); 2705c8378d12SStefano Zampini } 2706c8378d12SStefano Zampini 27079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2708afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2709afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2710fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 2711fe5544b9SJunchao Zhang cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA]; 2712fe5544b9SJunchao Zhang #else 2713fe5544b9SJunchao Zhang cusparseSpMatDescr_t &matADescr = mat->matDescr; 2714fe5544b9SJunchao Zhang #endif 2715fe5544b9SJunchao Zhang 2716a5b23f4aSJose E.
Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2717afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2718fcdce8c4SStefano Zampini size_t mmBufferSize; 27199371c9d4SSatish Balay if (mmdata->initialized && mmdata->Blda != blda) { 27209371c9d4SSatish Balay PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 27219371c9d4SSatish Balay mmdata->matBDescr = NULL; 27229371c9d4SSatish Balay } 2723afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 27249566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2725afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2726afb2bd1cSJunchao Zhang } 2727c8378d12SStefano Zampini 27289371c9d4SSatish Balay if (mmdata->initialized && mmdata->Clda != clda) { 27299371c9d4SSatish Balay PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 27309371c9d4SSatish Balay mmdata->matCDescr = NULL; 27319371c9d4SSatish Balay } 2732afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 27339566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2734afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2735afb2bd1cSJunchao Zhang } 2736afb2bd1cSJunchao Zhang 2737fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0 2738fe5544b9SJunchao Zhang if (matADescr) { 273917f5f06fSJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr.
It could be a cusparse bug 2740fe5544b9SJunchao Zhang matADescr = NULL; 2741fe5544b9SJunchao Zhang } 2742fe5544b9SJunchao Zhang #endif 2743fe5544b9SJunchao Zhang 2744fe5544b9SJunchao Zhang if (!matADescr) { 2745fe5544b9SJunchao Zhang stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 27469371c9d4SSatish Balay CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 27479371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2748afb2bd1cSJunchao Zhang } 2749fe5544b9SJunchao Zhang 2750fe5544b9SJunchao Zhang PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize)); 2751fe5544b9SJunchao Zhang 2752fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 27539566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 27549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2755fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2756fcdce8c4SStefano Zampini } 2757fe5544b9SJunchao Zhang 2758fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0 2759fe5544b9SJunchao Zhang PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2760fe5544b9SJunchao Zhang #endif 2761fe5544b9SJunchao Zhang 2762afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2763afb2bd1cSJunchao Zhang } else { 2764afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */
// Fast path: descriptors already match, only refresh the raw data pointers, then run cusparseSpMM.
// Pre-CUDA-11 fallback: csrmm cannot transpose B, so B^T is materialized into mmdata->Bt with
// cublasXgeam before calling cusparse_csr_spmm.
2765fe5544b9SJunchao Zhang PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get())); 27669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 27679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2768afb2bd1cSJunchao Zhang } 2769afb2bd1cSJunchao Zhang 2770afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2771fe5544b9SJunchao Zhang PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2772afb2bd1cSJunchao Zhang #else 2773afb2bd1cSJunchao Zhang PetscInt k; 2774afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2775ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2776ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2777ccdfe979SStefano Zampini cublasStatus_t cerr; 2778ccdfe979SStefano Zampini 27799566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 27809371c9d4SSatish Balay cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 27819371c9d4SSatish Balay PetscCallCUBLAS(cerr); 2782ccdfe979SStefano Zampini blda = B->cmap->n; 2783afb2bd1cSJunchao Zhang k = B->cmap->n; 2784afb2bd1cSJunchao Zhang } else { 2785afb2bd1cSJunchao Zhang k = B->rmap->n; 2786ccdfe979SStefano Zampini } 2787ccdfe979SStefano Zampini 2788afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 27899371c9d4SSatish Balay stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ?
mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 27909371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2791afb2bd1cSJunchao Zhang #endif 27929566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 27939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 2794cd3f9d89SJunchao Zhang PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray)); 2795ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2796cd3f9d89SJunchao Zhang PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 27974742e46bSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2798ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2799cd3f9d89SJunchao Zhang PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 28004742e46bSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2801ccdfe979SStefano Zampini } else { 2802cd3f9d89SJunchao Zhang PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray)); 2803ccdfe979SStefano Zampini } 280448a46eb9SPierre Jolivet if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 280548a46eb9SPierre Jolivet if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 28063ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2807ccdfe979SStefano Zampini } 2808ccdfe979SStefano Zampini 2809d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2810d71ae5a4SJacob Faibussowitsch { 2811ccdfe979SStefano Zampini Mat_Product *product = C->product; 2812ccdfe979SStefano Zampini Mat A, B; 2813ccdfe979SStefano Zampini PetscInt m, n; 2814ccdfe979SStefano Zampini PetscBool cisdense, flg; 2815ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2816ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2817ccdfe979SStefano Zampini
// MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA continued: requires A of type MATSEQAIJCUSPARSE in
// CSR format, then per product type computes the result sizes (m, n) and propagates block sizes from
// A/B onto C's layouts before MatSetSizes.
2818ccdfe979SStefano Zampini PetscFunctionBegin; 2819ccdfe979SStefano Zampini MatCheckProduct(C, 1); 282028b400f6SJacob Faibussowitsch PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2821ccdfe979SStefano Zampini A = product->A; 2822ccdfe979SStefano Zampini B = product->B; 28239566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 282428b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2825ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 282608401ef6SPierre Jolivet PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2827ccdfe979SStefano Zampini switch (product->type) { 2828ccdfe979SStefano Zampini case MATPRODUCT_AB: 2829ccdfe979SStefano Zampini m = A->rmap->n; 2830ccdfe979SStefano Zampini n = B->cmap->n; 28310e6a1e94SMark Adams PetscCall(MatSetBlockSizesFromMats(C, A, B)); 2832ccdfe979SStefano Zampini break; 2833ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2834ccdfe979SStefano Zampini m = A->cmap->n; 2835ccdfe979SStefano Zampini n = B->cmap->n; 28360e6a1e94SMark Adams if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 28370e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 2838ccdfe979SStefano Zampini break; 2839ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2840ccdfe979SStefano Zampini m = A->rmap->n; 2841ccdfe979SStefano Zampini n = B->rmap->n; 28420e6a1e94SMark Adams if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 28430e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 2844ccdfe979SStefano Zampini break; 2845ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2846ccdfe979SStefano Zampini m = B->cmap->n; 2847ccdfe979SStefano
Zampini n = B->cmap->n; 28480e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs)); 28490e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 2850ccdfe979SStefano Zampini break; 2851ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2852ccdfe979SStefano Zampini m = B->rmap->n; 2853ccdfe979SStefano Zampini n = B->rmap->n; 28540e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs)); 28550e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 2856ccdfe979SStefano Zampini break; 2857d71ae5a4SJacob Faibussowitsch default: 2858d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2859ccdfe979SStefano Zampini } 28609566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, m, n, m, n)); 2861ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 28629566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 28639566063dSJacob Faibussowitsch PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2864ccdfe979SStefano Zampini 2865ccdfe979SStefano Zampini /* product data */ 28669566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2867ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2868afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2869afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 287048a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2871afb2bd1cSJunchao Zhang #endif 2872ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2873ccdfe979SStefano
// Intermediate dense X (RARt: A*R^T sized rows(A) x rows(B); PtAP: A*P sized rows(A) x cols(B));
// sizes are set but allocation is deferred to the first MatDenseCUDAGetArray. Finally the product
// data, its destructor, and the numeric callback are installed on C.
Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 28749566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 28759566063dSJacob Faibussowitsch PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2876ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 28779566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2878ccdfe979SStefano Zampini } else { 28799566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2880ccdfe979SStefano Zampini } 2881ccdfe979SStefano Zampini } 2882ccdfe979SStefano Zampini C->product->data = mmdata; 2883ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2884ccdfe979SStefano Zampini 2885ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 28863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2887ccdfe979SStefano Zampini } 2888ccdfe979SStefano Zampini 2889d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2890d71ae5a4SJacob Faibussowitsch { 2891ccdfe979SStefano Zampini Mat_Product *product = C->product; 2892fcdce8c4SStefano Zampini Mat A, B; 2893fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2894fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2895fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2896fcdce8c4SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 2897fcdce8c4SStefano Zampini PetscBool flg; 2898fcdce8c4SStefano Zampini cusparseStatus_t stat; 2899fcdce8c4SStefano Zampini MatProductType ptype; 2900fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2901fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2902fcdce8c4SStefano Zampini
// MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE continued (sparse*sparse on the GPU). If reusesym
// is set the values were already computed during the symbolic phase (api_user path), so it only
// re-validates C's CSR structures and jumps to 'finalize'; an empty C (c->nz == 0) also short-circuits.
cusparseSpMatDescr_t BmatSpDescr; 2903fcdce8c4SStefano Zampini #endif 2904b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2905ccdfe979SStefano Zampini 2906ccdfe979SStefano Zampini PetscFunctionBegin; 2907ccdfe979SStefano Zampini MatCheckProduct(C, 1); 290828b400f6SJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 29099566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg)); 291028b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2911fcdce8c4SStefano Zampini mmdata = (MatMatCusparse *)C->product->data; 2912fcdce8c4SStefano Zampini A = product->A; 2913fcdce8c4SStefano Zampini B = product->B; 2914fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2915fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2916fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 291708401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2918fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 291928b400f6SJacob Faibussowitsch PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]); 2920fcdce8c4SStefano Zampini Ccsr = (CsrMatrix *)Cmat->mat; 292128b400f6SJacob Faibussowitsch PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2922fcdce8c4SStefano Zampini goto finalize; 2923fcdce8c4SStefano Zampini } 2924fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 29259566063dSJacob Faibussowitsch
// Full path: A, B, C must all be MATSEQAIJCUSPARSE in CSR format, not bound to CPU; both operands
// are pushed to the GPU. Since spgemm has no transpose support, AtB/ABt with a symmetric operand are
// remapped to AB (the symbolic phase must have used the same fact), otherwise the explicit
// matTranspose mult struct is selected.
PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 292628b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 29279566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 292828b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 292928b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 293028b400f6SJacob Faibussowitsch PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2931fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2932fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2933fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 293408401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 293508401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 293608401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 29379566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 29389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2939fcdce8c4SStefano Zampini 2940fcdce8c4SStefano Zampini ptype = product->type; 2941b94d7dedSBarry Smith if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2942fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 294328b400f6SJacob
Faibussowitsch PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric"); 2944fa046f9fSJunchao Zhang } 2945b94d7dedSBarry Smith if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2946fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 294728b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric"); 2948fa046f9fSJunchao Zhang } 2949fcdce8c4SStefano Zampini switch (ptype) { 2950fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2951fcdce8c4SStefano Zampini Amat = Acusp->mat; 2952fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2953fcdce8c4SStefano Zampini break; 2954fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2955fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2956fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2957fcdce8c4SStefano Zampini break; 2958fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2959fcdce8c4SStefano Zampini Amat = Acusp->mat; 2960fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2961fcdce8c4SStefano Zampini break; 2962d71ae5a4SJacob Faibussowitsch default: 2963d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2964fcdce8c4SStefano Zampini } 2965fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 296628b400f6SJacob Faibussowitsch PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 296728b400f6SJacob Faibussowitsch PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 296828b400f6SJacob Faibussowitsch PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
// Numeric kernel: CUDA >= 11.4 uses cusparseSpGEMMreuse_compute (structure fixed at symbolic time);
// CUDA 11.x uses cusparseSpGEMM_compute + cusparseSpGEMM_copy; pre-CUDA-11 falls back to the legacy
// csrgemm wrapper. B may come through mmdata->Bcsr/matSpBDescr when stored compressed-row. Afterwards
// the offload mask is set to GPU and 'finalize' performs the abbreviated MatAssemblyEnd bookkeeping.
2969fcdce8c4SStefano Zampini Acsr = (CsrMatrix *)Amat->mat; 2970fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */ 2971fcdce8c4SStefano Zampini Ccsr = (CsrMatrix *)Cmat->mat; 297228b400f6SJacob Faibussowitsch PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 297328b400f6SJacob Faibussowitsch PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 297428b400f6SJacob Faibussowitsch PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 29759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2976fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2977fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 29789566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2979b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 29809371c9d4SSatish Balay stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 29819371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2982b4285af6SJunchao Zhang #else 29839371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 29849371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29859371c9d4SSatish Balay stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 29869371c9d4SSatish Balay PetscCallCUSPARSE(stat);
2987b4285af6SJunchao Zhang #endif 2988fcdce8c4SStefano Zampini #else 29899371c9d4SSatish Balay stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 29909371c9d4SSatish Balay Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 29919371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2992fcdce8c4SStefano Zampini #endif 29939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 29949566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 29959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2996fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2997fcdce8c4SStefano Zampini finalize: 2998fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 29999566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 30009566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 30019566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 3002fcdce8c4SStefano Zampini c->reallocs = 0; 3003fcdce8c4SStefano Zampini C->info.mallocs += 0; 3004fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 3005fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 3006fcdce8c4SStefano Zampini C->num_ass++; 30073ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3008ccdfe979SStefano Zampini } 3009fcdce8c4SStefano Zampini 3010d71ae5a4SJacob Faibussowitsch static PetscErrorCode
MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  /*
    Symbolic phase of the sparse-sparse product C = A*B, A^T*B, or A*B^t for two
    MATSEQAIJCUSPARSE matrices: determines the nonzero pattern of C on the GPU via
    cuSPARSE SpGEMM, allocates the device CSR storage for C, and mirrors the row
    offsets / column indices back to the host so C looks like a normal (unassembled)
    SeqAIJ matrix. The numeric phase is installed at the end as C->ops->productnumeric.

    Three code paths are selected at compile time:
      - CUDA >= 11.4: cusparseSpGEMMreuse_* (pattern reused across numeric calls)
      - CUDA 11.0 - 11.3: cusparseSpGEMM_* (compute done here; numeric phase redoes it)
      - CUDA < 11.0: legacy cusparseXcsrgemmNnz + csrgemm
  */
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* For symmetric matrices the transposed products reduce to plain A*B; record the
     fact so the numeric phase can verify it made the same reduction */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  /* Select operands (explicit transposes are formed where needed, since opA/opB stay
     NON_TRANSPOSE) and result dimensions m x n, with inner dimension k */
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    /* C inherits A's set of nonzero rows (rindex) since a row of C can only be nonempty if the matching row of A is */
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m; /* device CSR only stores the (possibly compressed) nonzero rows */
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: C has an all-zero row-offset array and no entries; skip SpGEMM entirely */
    PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* Build a shallow-copy CSR view of B with uncompressed row offsets; column
       indices and values are shared with the original device storage */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr; /* kept in mmdata so the numeric phase uses the same uncompressed view */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each entry A(i,brow) multiplies the whole row brow of B: 2*nnz(B(brow,:)) flops */
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A pairs with row i of B: 2*nnz(A(i,:))*nnz(B(i,:)) flops */
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  // cuda-12.2 requires non-null csrRowOffsets
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz                = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* Mirror the device CSR pattern into the host SeqAIJ structure (c->i/c->j) and
     fill in the bookkeeping normally done by MatAssemblyEnd_SeqAIJ */
  c->free_a = PETSC_TRUE;
  PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
  PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
  c->free_ij = PETSC_TRUE;
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* the compressed offsets were copied into c->compressedrow.i above; rebuild the
       full m+1 offset array by repeating each offset across the empty rows */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  /* Decide which productsymbolic backend to use for this product: CUSPARSE sparse-sparse,
     CUDA dense kernels, the generic ABC fallback, or the plain CPU SeqAIJ path. */
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* Only consider the GPU path when neither operand has been bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* Each product type exposes its own command-line switch to force the CPU backend;
       the option name differs depending on whether the user came through the old API
       (MatMatMult() etc.) or the MatProduct API. */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    /* Forcing the CPU backend simply pretends the operands are not CUSPARSE matrices */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are handled by chaining two pairwise products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
Faibussowitsch { 35029ae82921SPaul Mullowney PetscFunctionBegin; 35039566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 35043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3505e6e9a74fSStefano Zampini } 3506e6e9a74fSStefano Zampini 3507d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3508d71ae5a4SJacob Faibussowitsch { 3509e6e9a74fSStefano Zampini PetscFunctionBegin; 35109566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 35113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3512e6e9a74fSStefano Zampini } 3513e6e9a74fSStefano Zampini 3514d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3515d71ae5a4SJacob Faibussowitsch { 3516e6e9a74fSStefano Zampini PetscFunctionBegin; 35179566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 35183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3519e6e9a74fSStefano Zampini } 3520e6e9a74fSStefano Zampini 3521d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3522d71ae5a4SJacob Faibussowitsch { 3523e6e9a74fSStefano Zampini PetscFunctionBegin; 35249566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 35253ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 35269ae82921SPaul Mullowney } 35279ae82921SPaul Mullowney 3528d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3529d71ae5a4SJacob Faibussowitsch { 3530ca45077fSPaul Mullowney PetscFunctionBegin; 35319566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* CUDA kernel: y[idx[i]] += x[i] for i in [0,n) — scatters the compressed-row work
   vector back into the full-length output vector (one thread per entry) */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* logical lengths of x and y for the chosen op(A); set only on the CSR path, the only one that reaches cusparseCreateDnVec() */
#endif

  PetscFunctionBegin;
  /* herm implies trans: op must be ^H or ^T; plain conjugation is not supported */
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) {
    /* Empty matrix: op(A) x is zero, so the result is just yy (or zero) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* For ^H, or when no explicit transpose is kept, let cuSPARSE apply the op to the
       untransposed matrix; otherwise multiply with the stored explicit transpose */
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_cols; // since y = Ax
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_rows; // since y = A^T x
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
#else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
#endif

      /* opA is used to index matDescr_SpMV[] and cuSpMV[]; guard against an ABI change in the enum values */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) { /* lazily build the per-op matrix descriptor */
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
#endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
#if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* pre-CUDA-11 legacy API */
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {           /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* transpose path: A^T x is already full length in zz; just add yy if requested */
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero for multiply-add; without yy the first write per nonzero row is an assignment */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^T * xx + yy */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Host-side assembly only; the data is mirrored to the GPU lazily on first use */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* The GPU payload lives in A->spptr; which destructor applies depends on whether
     this matrix holds a factorization */
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* Remove the composed methods this type registered so the base destructor sees a clean object */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
38019566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 38029566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 38039566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 38049566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 38059566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 38069566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL)); 38079566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 38083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 38099ae82921SPaul Mullowney } 38109ae82921SPaul Mullowney 3811ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 381295639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 3813d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) 3814d71ae5a4SJacob Faibussowitsch { 38159ff858a8SKarl Rupp PetscFunctionBegin; 38169566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 38179566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 38183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 38199ff858a8SKarl Rupp } 38209ff858a8SKarl Rupp 3821d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3822d71ae5a4SJacob Faibussowitsch { 3823a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 
3824039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3825039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3826039c6fbaSStefano Zampini PetscScalar *ay; 3827039c6fbaSStefano Zampini const PetscScalar *ax; 3828039c6fbaSStefano Zampini CsrMatrix *csry, *csrx; 3829e6e9a74fSStefano Zampini 383095639643SRichard Tran Mills PetscFunctionBegin; 3831a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3832a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3833039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 38349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 38359566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 38363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 383795639643SRichard Tran Mills } 3838039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 38399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 38409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 38415f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 38425f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3843039c6fbaSStefano Zampini csry = (CsrMatrix *)cy->mat->mat; 3844039c6fbaSStefano Zampini csrx = (CsrMatrix *)cx->mat->mat; 3845039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3846039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3847039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3848ad540459SPierre Jolivet if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), 
csrx->column_indices->begin()); 3849039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3850039c6fbaSStefano Zampini } 3851d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3852d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3853039c6fbaSStefano Zampini 3854039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3855039c6fbaSStefano Zampini PetscScalar b = 1.0; 3856039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3857039c6fbaSStefano Zampini size_t bufferSize; 3858039c6fbaSStefano Zampini void *buffer; 3859039c6fbaSStefano Zampini #endif 3860039c6fbaSStefano Zampini 38619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 38629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 38639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3864039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 38659371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 38669371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 38679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 38689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 38699371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 38709371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), 
csry->column_indices->data().get(), buffer)); 38719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 38729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 38739566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 3874039c6fbaSStefano Zampini #else 38759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 38769371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 38779371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 38789566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 38799566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3880039c6fbaSStefano Zampini #endif 38819566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 38829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 38839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 38849566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3885039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3886a587d139SMark cublasHandle_t cublasv2handle; 3887a587d139SMark PetscBLASInt one = 1, bnz = 1; 3888039c6fbaSStefano Zampini 38899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 38909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 38919566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 38929566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz, &bnz)); 38939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 38949566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle, 
bnz, &a, ax, one, ay, one)); 38959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * bnz)); 38969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 38979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 38989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 38999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3900039c6fbaSStefano Zampini } else { 39019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 39029566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3903a587d139SMark } 39043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 390595639643SRichard Tran Mills } 390695639643SRichard Tran Mills 3907d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3908d71ae5a4SJacob Faibussowitsch { 390933c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 391033c9ba73SStefano Zampini PetscScalar *ay; 391133c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 391233c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 391333c9ba73SStefano Zampini 391433c9ba73SStefano Zampini PetscFunctionBegin; 39159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 39169566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 39179566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz, &bnz)); 39189566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 39199566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 39209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(bnz)); 39219566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 39229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 39239566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 39243ba16761SJacob Faibussowitsch 
PetscFunctionReturn(PETSC_SUCCESS); 392533c9ba73SStefano Zampini } 392633c9ba73SStefano Zampini 3927d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3928d71ae5a4SJacob Faibussowitsch { 39297e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3930a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 39317e8381f9SStefano Zampini 39323fa6b06aSMark Adams PetscFunctionBegin; 39333fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 39343fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 39357e8381f9SStefano Zampini if (spptr->mat) { 39367e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 39377e8381f9SStefano Zampini if (matrix->values) { 39387e8381f9SStefano Zampini both = PETSC_TRUE; 39397e8381f9SStefano Zampini thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 39407e8381f9SStefano Zampini } 39417e8381f9SStefano Zampini } 39427e8381f9SStefano Zampini if (spptr->matTranspose) { 39437e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3944ad540459SPierre Jolivet if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 39457e8381f9SStefano Zampini } 39463fa6b06aSMark Adams } 39479566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 39489566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 39497e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3950a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 39513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 39523fa6b06aSMark Adams } 39533fa6b06aSMark Adams 3954d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 3955d71ae5a4SJacob Faibussowitsch { 3956a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3957a587d139SMark 3958a587d139SMark PetscFunctionBegin; 39599a14fc28SStefano Zampini if 
(A->factortype != MAT_FACTOR_NONE) { 39609a14fc28SStefano Zampini A->boundtocpu = flg; 39613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 39629a14fc28SStefano Zampini } 3963a587d139SMark if (flg) { 39649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3965a587d139SMark 396633c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3967a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3968a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3969a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3970a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3971a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3972a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3973a587d139SMark A->ops->multhermitiantranspose = NULL; 3974a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3975fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 39769566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 39779566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 39789566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 39799566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 39809566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 39819566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 39829566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3983a587d139SMark } else { 398433c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3985a587d139SMark A->ops->axpy = 
MatAXPY_SeqAIJCUSPARSE; 3986a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3987a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3988a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3989a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3990a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3991a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3992a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3993fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 399467a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 399567a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 399667a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 399767a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 399867a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 399967a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 40007ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 40017ee59b9bSJunchao Zhang 40029566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 40039566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 40049566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 40059566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", 
MatSetPreallocationCOO_SeqAIJCUSPARSE)); 40069566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 40079566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4008a587d139SMark } 4009a587d139SMark A->boundtocpu = flg; 4010ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 4011ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 4012ea500dcfSRichard Tran Mills } else { 4013ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 4014ea500dcfSRichard Tran Mills } 40153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4016a587d139SMark } 4017a587d139SMark 40188eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 4019d71ae5a4SJacob Faibussowitsch { 402049735bf3SStefano Zampini Mat B; 40219ae82921SPaul Mullowney 40229ae82921SPaul Mullowney PetscFunctionBegin; 40239566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 402449735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 40259566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 402649735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 40279566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 402849735bf3SStefano Zampini } 402949735bf3SStefano Zampini B = *newmat; 403049735bf3SStefano Zampini 40319566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 40329566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 403334136279SStefano Zampini 403449735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 40359ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 4036e6e9a74fSStefano Zampini 
Mat_SeqAIJCUSPARSE *spptr; 40379566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 40389566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 40399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 40401a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 4041d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4042b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4043a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4044a435da06SStefano Zampini #else 4045d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4046a435da06SStefano Zampini #endif 4047d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4048d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4049d8132acaSStefano Zampini #endif 40501a2c6b5cSJunchao Zhang B->spptr = spptr; 40519ae82921SPaul Mullowney } else { 4052e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 4053e6e9a74fSStefano Zampini 40549566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 40559566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 40569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4057e6e9a74fSStefano Zampini B->spptr = spptr; 40589ae82921SPaul Mullowney } 4059e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 406049735bf3SStefano Zampini } 4061693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 40629ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 40631a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 40649ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 406595639643SRichard Tran Mills B->ops->bindtocpu = 
MatBindToCPU_SeqAIJCUSPARSE; 4066693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 40672205254eSKarl Rupp 40689566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 40699566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 40709566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4071ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 40729566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 4073ae48a8d0SStefano Zampini #endif 40749566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 40753ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 40769ae82921SPaul Mullowney } 40779ae82921SPaul Mullowney 4078d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4079d71ae5a4SJacob Faibussowitsch { 408002fe1965SBarry Smith PetscFunctionBegin; 40819566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(B)); 40829566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 40833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 408402fe1965SBarry Smith } 408502fe1965SBarry Smith 40863ca39a21SBarry Smith /*MC 4087e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 4088e057df02SPaul Mullowney 408915229ffcSPierre Jolivet A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either 409011a5261eSBarry Smith CSR, ELL, or Hybrid format. 409111a5261eSBarry Smith All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 
4092e057df02SPaul Mullowney 4093e057df02SPaul Mullowney Options Database Keys: 409411a5261eSBarry Smith + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 40952ef1f0ffSBarry Smith . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`). 40962ef1f0ffSBarry Smith Other options include ell (ellpack) or hyb (hybrid). 40972ef1f0ffSBarry Smith . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid). 40982ef1f0ffSBarry Smith - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU 4099e057df02SPaul Mullowney 4100e057df02SPaul Mullowney Level: beginner 4101e057df02SPaul Mullowney 41021cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4103e057df02SPaul Mullowney M*/ 41047f756511SDominic Meiser 4105d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 4106d71ae5a4SJacob Faibussowitsch { 410742c9c57cSBarry Smith PetscFunctionBegin; 41089566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 41099566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 41109566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 41119566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 41123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
411342c9c57cSBarry Smith } 411429b38603SBarry Smith 41152c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat) 4116d71ae5a4SJacob Faibussowitsch { 41172c4ab24aSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 4118cbc6b225SStefano Zampini 4119cbc6b225SStefano Zampini PetscFunctionBegin; 41202c4ab24aSJunchao Zhang if (cusp) { 41212c4ab24aSJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format)); 41222c4ab24aSJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 41232c4ab24aSJunchao Zhang delete cusp->workVector; 41242c4ab24aSJunchao Zhang delete cusp->rowoffsets_gpu; 41252c4ab24aSJunchao Zhang delete cusp->csr2csc_i; 41262c4ab24aSJunchao Zhang delete cusp->coords; 41272c4ab24aSJunchao Zhang if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle)); 41282c4ab24aSJunchao Zhang PetscCall(PetscFree(mat->spptr)); 41297f756511SDominic Meiser } 41303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41317f756511SDominic Meiser } 41327f756511SDominic Meiser 4133d71ae5a4SJacob Faibussowitsch static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 4134d71ae5a4SJacob Faibussowitsch { 41357f756511SDominic Meiser PetscFunctionBegin; 41367f756511SDominic Meiser if (*mat) { 41377f756511SDominic Meiser delete (*mat)->values; 41387f756511SDominic Meiser delete (*mat)->column_indices; 41397f756511SDominic Meiser delete (*mat)->row_offsets; 41407f756511SDominic Meiser delete *mat; 41417f756511SDominic Meiser *mat = 0; 41427f756511SDominic Meiser } 41433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41447f756511SDominic Meiser } 41457f756511SDominic Meiser 4146b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4147d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 4148d71ae5a4SJacob Faibussowitsch { 
41497f756511SDominic Meiser PetscFunctionBegin; 41507f756511SDominic Meiser if (*trifactor) { 41519566063dSJacob Faibussowitsch if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 4152261a78b4SJunchao Zhang if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 41539566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 41549566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 41559566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4156afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 41579566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 4158afb2bd1cSJunchao Zhang #endif 41599566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactor)); 41607f756511SDominic Meiser } 41613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41627f756511SDominic Meiser } 4163d460d7bfSJunchao Zhang #endif 41647f756511SDominic Meiser 4165d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) 4166d71ae5a4SJacob Faibussowitsch { 41677f756511SDominic Meiser CsrMatrix *mat; 41687f756511SDominic Meiser 41697f756511SDominic Meiser PetscFunctionBegin; 41707f756511SDominic Meiser if (*matstruct) { 41717f756511SDominic Meiser if ((*matstruct)->mat) { 41727f756511SDominic Meiser if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 4173afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4174afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 4175afb2bd1cSJunchao Zhang #else 41767f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 
41779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4178afb2bd1cSJunchao Zhang #endif 41797f756511SDominic Meiser } else { 41807f756511SDominic Meiser mat = (CsrMatrix *)(*matstruct)->mat; 41813ba16761SJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&mat)); 41827f756511SDominic Meiser } 41837f756511SDominic Meiser } 41849566063dSJacob Faibussowitsch if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 41857f756511SDominic Meiser delete (*matstruct)->cprowIndices; 41869566063dSJacob Faibussowitsch if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 41879566063dSJacob Faibussowitsch if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 41889566063dSJacob Faibussowitsch if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4189afb2bd1cSJunchao Zhang 4190afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4191afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 41929566063dSJacob Faibussowitsch if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4193fe5544b9SJunchao Zhang 4194afb2bd1cSJunchao Zhang for (int i = 0; i < 3; i++) { 4195afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 41969566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 41979566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 41989566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4199fe5544b9SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 4200fe5544b9SJunchao Zhang if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i])); 4201fe5544b9SJunchao Zhang if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i])); 4202fe5544b9SJunchao Zhang #endif 4203afb2bd1cSJunchao Zhang } 4204afb2bd1cSJunchao 
Zhang } 4205afb2bd1cSJunchao Zhang #endif 42067f756511SDominic Meiser delete *matstruct; 42077e8381f9SStefano Zampini *matstruct = NULL; 42087f756511SDominic Meiser } 42093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 42107f756511SDominic Meiser } 42117f756511SDominic Meiser 4212d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) 4213d71ae5a4SJacob Faibussowitsch { 4214da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4215da112707SJunchao Zhang 42167f756511SDominic Meiser PetscFunctionBegin; 4217da112707SJunchao Zhang if (fs) { 4218b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4219da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4220da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4221da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4222da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4223d460d7bfSJunchao Zhang delete fs->workVector; 4224d460d7bfSJunchao Zhang fs->workVector = NULL; 4225d460d7bfSJunchao Zhang #endif 4226da112707SJunchao Zhang delete fs->rpermIndices; 4227da112707SJunchao Zhang delete fs->cpermIndices; 4228da112707SJunchao Zhang fs->rpermIndices = NULL; 4229da112707SJunchao Zhang fs->cpermIndices = NULL; 4230da112707SJunchao Zhang fs->init_dev_prop = PETSC_FALSE; 4231b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4232da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4233da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx)); 423430807b38SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr32)); 423530807b38SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx32)); 4236da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrVal)); 4237d460d7bfSJunchao Zhang PetscCallCUDA(cudaFree(fs->diag)); 
4238da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->X)); 4239da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->Y)); 424012ba2bc6SJunchao Zhang // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 4241da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4242da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 424312ba2bc6SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4244da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4245da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4246da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4247da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4248da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4249da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4250da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4251da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4252da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4253da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4254da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4255da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4256d460d7bfSJunchao Zhang PetscCall(PetscFree(fs->csrRowPtr_h)); 4257d460d7bfSJunchao Zhang PetscCall(PetscFree(fs->csrVal_h)); 4258d460d7bfSJunchao Zhang PetscCall(PetscFree(fs->diag_h)); 425912ba2bc6SJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_FALSE; 426012ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4261da112707SJunchao Zhang #endif 4262ccdfe979SStefano Zampini } 42633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
4264ccdfe979SStefano Zampini } 4265ccdfe979SStefano Zampini 4266d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) 4267d71ae5a4SJacob Faibussowitsch { 4268ccdfe979SStefano Zampini PetscFunctionBegin; 4269ccdfe979SStefano Zampini if (*trifactors) { 42709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4271f0173cd6SStefano Zampini PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle)); 42729566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 42737f756511SDominic Meiser } 42743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 42757f756511SDominic Meiser } 42767e8381f9SStefano Zampini 42779371c9d4SSatish Balay struct IJCompare { 4278d71ae5a4SJacob Faibussowitsch __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4279d71ae5a4SJacob Faibussowitsch { 42800b156cc8SJunchao Zhang if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; 42810b156cc8SJunchao Zhang if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 42827e8381f9SStefano Zampini return false; 42837e8381f9SStefano Zampini } 42847e8381f9SStefano Zampini }; 42857e8381f9SStefano Zampini 428666976f2fSJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) 4287d71ae5a4SJacob Faibussowitsch { 4288a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4289a49f1ed0SStefano Zampini 4290a49f1ed0SStefano Zampini PetscFunctionBegin; 4291a49f1ed0SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 42923ba16761SJacob Faibussowitsch if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 4293a49f1ed0SStefano Zampini if (destroy) { 42949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4295a49f1ed0SStefano Zampini delete 
cusp->csr2csc_i; 4296a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 4297a49f1ed0SStefano Zampini } 42981a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 42993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4300a49f1ed0SStefano Zampini } 4301a49f1ed0SStefano Zampini 430249abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data) 4303d71ae5a4SJacob Faibussowitsch { 430449abdd8aSBarry Smith MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data; 43054d86920dSPierre Jolivet 43067e8381f9SStefano Zampini PetscFunctionBegin; 43072c4ab24aSJunchao Zhang PetscCallCUDA(cudaFree(coo->perm)); 43082c4ab24aSJunchao Zhang PetscCallCUDA(cudaFree(coo->jmap)); 43092c4ab24aSJunchao Zhang PetscCall(PetscFree(coo)); 43103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 43117e8381f9SStefano Zampini } 4312ed502f03SStefano Zampini 431366976f2fSJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 4314d71ae5a4SJacob Faibussowitsch { 43152c4ab24aSJunchao Zhang PetscBool dev_ij = PETSC_FALSE; 43162c4ab24aSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_HOST; 43172c4ab24aSJunchao Zhang PetscInt *i, *j; 431803e76207SPierre Jolivet PetscContainer container_h; 43192c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJ *coo_h, *coo_d; 4320219fbbafSJunchao Zhang 4321219fbbafSJunchao Zhang PetscFunctionBegin; 43222c4ab24aSJunchao Zhang // The two MatResetPreallocationCOO_* must be done in order. 
The former relies on values that might be destroyed by the latter 43239566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i, &mtype)); 43242c4ab24aSJunchao Zhang if (PetscMemTypeDevice(mtype)) { 43252c4ab24aSJunchao Zhang dev_ij = PETSC_TRUE; 43262c4ab24aSJunchao Zhang PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j)); 43272c4ab24aSJunchao Zhang PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 43282c4ab24aSJunchao Zhang PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 43292c4ab24aSJunchao Zhang } else { 43302c4ab24aSJunchao Zhang i = coo_i; 43312c4ab24aSJunchao Zhang j = coo_j; 4332219fbbafSJunchao Zhang } 4333219fbbafSJunchao Zhang 43342c4ab24aSJunchao Zhang PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); 43352c4ab24aSJunchao Zhang if (dev_ij) PetscCall(PetscFree2(i, j)); 4336cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 43372c4ab24aSJunchao Zhang // Create the GPU memory 43389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 43392c4ab24aSJunchao Zhang 43402c4ab24aSJunchao Zhang // Copy the COO struct to device 43412c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 43422c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 43432c4ab24aSJunchao Zhang PetscCall(PetscMalloc1(1, &coo_d)); 43442c4ab24aSJunchao Zhang *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different 43452c4ab24aSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount))); 43462c4ab24aSJunchao Zhang PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 43472c4ab24aSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount))); 43482c4ab24aSJunchao Zhang PetscCallCUDA(cudaMemcpy(coo_d->perm, 
coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 43492c4ab24aSJunchao Zhang 43502c4ab24aSJunchao Zhang // Put the COO struct in a container and then attach that to the matrix 435103e76207SPierre Jolivet PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE)); 43523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4353219fbbafSJunchao Zhang } 4354219fbbafSJunchao Zhang 4355d71ae5a4SJacob Faibussowitsch __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) 4356d71ae5a4SJacob Faibussowitsch { 4357219fbbafSJunchao Zhang PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4358219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4359b6c38306SJunchao Zhang for (; i < nnz; i += grid_size) { 4360b6c38306SJunchao Zhang PetscScalar sum = 0.0; 4361b6c38306SJunchao Zhang for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4362b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES ? 
0.0 : a[i]) + sum; 4363b6c38306SJunchao Zhang } 4364219fbbafSJunchao Zhang } 4365219fbbafSJunchao Zhang 436666976f2fSJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 4367d71ae5a4SJacob Faibussowitsch { 4368219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4369219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4370219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4371219fbbafSJunchao Zhang PetscMemType memtype; 4372219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4373219fbbafSJunchao Zhang PetscScalar *Aa; 43742c4ab24aSJunchao Zhang PetscContainer container; 43752c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJ *coo; 4376219fbbafSJunchao Zhang 4377219fbbafSJunchao Zhang PetscFunctionBegin; 43782c4ab24aSJunchao Zhang if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 43792c4ab24aSJunchao Zhang 43802c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 43812c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 43822c4ab24aSJunchao Zhang 43839566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 4384219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 43852c4ab24aSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar))); 43862c4ab24aSJunchao Zhang PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4387219fbbafSJunchao Zhang } 4388219fbbafSJunchao Zhang 43899566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 43909566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4391219fbbafSJunchao Zhang 439208bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 4393cbc6b225SStefano Zampini if (Annz) { 43946497c311SBarry 
Smith MatAddCOOValues<<<((int)(Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa); 43959566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 4396cbc6b225SStefano Zampini } 439708bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 4398219fbbafSJunchao Zhang 43999566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 44009566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4401219fbbafSJunchao Zhang 44029566063dSJacob Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 44033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4404219fbbafSJunchao Zhang } 4405219fbbafSJunchao Zhang 44065b7e41feSStefano Zampini /*@C 44072ef1f0ffSBarry Smith MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices. 44085b7e41feSStefano Zampini 44092ef1f0ffSBarry Smith Not Collective 44105b7e41feSStefano Zampini 44115b7e41feSStefano Zampini Input Parameters: 44125b7e41feSStefano Zampini + A - the matrix 441311a5261eSBarry Smith - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 44145b7e41feSStefano Zampini 44155b7e41feSStefano Zampini Output Parameters: 441620f4b53cSBarry Smith + i - the CSR row pointers 441720f4b53cSBarry Smith - j - the CSR column indices 44185b7e41feSStefano Zampini 44195b7e41feSStefano Zampini Level: developer 44205b7e41feSStefano Zampini 442111a5261eSBarry Smith Note: 44225b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 44235b7e41feSStefano Zampini 44241cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 44255b7e41feSStefano Zampini @*/ 4426d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int 
**j) 4427d71ae5a4SJacob Faibussowitsch { 44285f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 44295f101d05SStefano Zampini CsrMatrix *csr; 44305f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 44315f101d05SStefano Zampini 44325f101d05SStefano Zampini PetscFunctionBegin; 44335f101d05SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 44343ba16761SJacob Faibussowitsch if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); 44355f101d05SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4436aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 44379566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 443828b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 44395f101d05SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 44405f101d05SStefano Zampini if (i) { 44415f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 44425f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 44435f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 44445f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 44459566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 44465f101d05SStefano Zampini } 44475f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 44485f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 44495f101d05SStefano Zampini } 44505f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 44513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 44525f101d05SStefano Zampini } 44535f101d05SStefano Zampini 44545b7e41feSStefano Zampini /*@C 44552ef1f0ffSBarry Smith MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices 
obtained with `MatSeqAIJCUSPARSEGetIJ()` 44565b7e41feSStefano Zampini 44572ef1f0ffSBarry Smith Not Collective 44585b7e41feSStefano Zampini 44595b7e41feSStefano Zampini Input Parameters: 44605b7e41feSStefano Zampini + A - the matrix 44612ef1f0ffSBarry Smith . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form 446220f4b53cSBarry Smith . i - the CSR row pointers 446320f4b53cSBarry Smith - j - the CSR column indices 44645b7e41feSStefano Zampini 44655b7e41feSStefano Zampini Level: developer 44665b7e41feSStefano Zampini 44671cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()` 44685b7e41feSStefano Zampini @*/ 446920f4b53cSBarry Smith PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) 4470d71ae5a4SJacob Faibussowitsch { 44715f101d05SStefano Zampini PetscFunctionBegin; 44725f101d05SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 44735f101d05SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 44745f101d05SStefano Zampini if (i) *i = NULL; 44755f101d05SStefano Zampini if (j) *j = NULL; 447620f4b53cSBarry Smith (void)compressed; 44773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 44785f101d05SStefano Zampini } 44795f101d05SStefano Zampini 44805b7e41feSStefano Zampini /*@C 448111a5261eSBarry Smith MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 44825b7e41feSStefano Zampini 44835b7e41feSStefano Zampini Not Collective 44845b7e41feSStefano Zampini 44855b7e41feSStefano Zampini Input Parameter: 448611a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 44875b7e41feSStefano Zampini 44885b7e41feSStefano Zampini Output Parameter: 44895b7e41feSStefano Zampini . 
a - pointer to the device data 44905b7e41feSStefano Zampini 44915b7e41feSStefano Zampini Level: developer 44925b7e41feSStefano Zampini 449311a5261eSBarry Smith Note: 449411a5261eSBarry Smith May trigger host-device copies if up-to-date matrix data is on host 44955b7e41feSStefano Zampini 44961cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 44975b7e41feSStefano Zampini @*/ 4498d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) 4499d71ae5a4SJacob Faibussowitsch { 4500ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4501ed502f03SStefano Zampini CsrMatrix *csr; 4502ed502f03SStefano Zampini 4503ed502f03SStefano Zampini PetscFunctionBegin; 4504ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 45054f572ea9SToby Isaac PetscAssertPointer(a, 2); 4506ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4507aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 45089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 450928b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4510ed502f03SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 451128b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4512ed502f03SStefano Zampini *a = csr->values->data().get(); 45133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4514ed502f03SStefano Zampini } 4515ed502f03SStefano Zampini 45165b7e41feSStefano Zampini /*@C 451711a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()` 45185b7e41feSStefano Zampini 45195b7e41feSStefano 
Zampini Not Collective 45205b7e41feSStefano Zampini 45212ef1f0ffSBarry Smith Input Parameters: 45222ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix 45232ef1f0ffSBarry Smith - a - pointer to the device data 45245b7e41feSStefano Zampini 45255b7e41feSStefano Zampini Level: developer 45265b7e41feSStefano Zampini 45271cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()` 45285b7e41feSStefano Zampini @*/ 4529d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) 4530d71ae5a4SJacob Faibussowitsch { 4531ed502f03SStefano Zampini PetscFunctionBegin; 4532ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 45334f572ea9SToby Isaac PetscAssertPointer(a, 2); 4534ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4535ed502f03SStefano Zampini *a = NULL; 45363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4537ed502f03SStefano Zampini } 4538ed502f03SStefano Zampini 45395b7e41feSStefano Zampini /*@C 454011a5261eSBarry Smith MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 45415b7e41feSStefano Zampini 45425b7e41feSStefano Zampini Not Collective 45435b7e41feSStefano Zampini 45445b7e41feSStefano Zampini Input Parameter: 454511a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 45465b7e41feSStefano Zampini 45475b7e41feSStefano Zampini Output Parameter: 45485b7e41feSStefano Zampini . 
a - pointer to the device data 45495b7e41feSStefano Zampini 45505b7e41feSStefano Zampini Level: developer 45515b7e41feSStefano Zampini 455211a5261eSBarry Smith Note: 455311a5261eSBarry Smith May trigger host-device copies if up-to-date matrix data is on host 45545b7e41feSStefano Zampini 45551cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 45565b7e41feSStefano Zampini @*/ 4557d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) 4558d71ae5a4SJacob Faibussowitsch { 4559039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4560039c6fbaSStefano Zampini CsrMatrix *csr; 4561039c6fbaSStefano Zampini 4562039c6fbaSStefano Zampini PetscFunctionBegin; 4563039c6fbaSStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 45644f572ea9SToby Isaac PetscAssertPointer(a, 2); 4565039c6fbaSStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4566aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 45679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 456828b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4569039c6fbaSStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 457028b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4571039c6fbaSStefano Zampini *a = csr->values->data().get(); 4572039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 45739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 45743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4575039c6fbaSStefano Zampini } 45765b7e41feSStefano Zampini /*@C 457711a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArray - restore the 
read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()` 4578039c6fbaSStefano Zampini 45795b7e41feSStefano Zampini Not Collective 45805b7e41feSStefano Zampini 45812ef1f0ffSBarry Smith Input Parameters: 45822ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix 45832ef1f0ffSBarry Smith - a - pointer to the device data 45845b7e41feSStefano Zampini 45855b7e41feSStefano Zampini Level: developer 45865b7e41feSStefano Zampini 45871cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()` 45885b7e41feSStefano Zampini @*/ 4589d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) 4590d71ae5a4SJacob Faibussowitsch { 4591039c6fbaSStefano Zampini PetscFunctionBegin; 4592039c6fbaSStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 45934f572ea9SToby Isaac PetscAssertPointer(a, 2); 4594039c6fbaSStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 45959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 45969566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4597039c6fbaSStefano Zampini *a = NULL; 45983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4599039c6fbaSStefano Zampini } 4600039c6fbaSStefano Zampini 46015b7e41feSStefano Zampini /*@C 460211a5261eSBarry Smith MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored 46035b7e41feSStefano Zampini 46045b7e41feSStefano Zampini Not Collective 46055b7e41feSStefano Zampini 46065b7e41feSStefano Zampini Input Parameter: 460711a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix 46085b7e41feSStefano Zampini 46095b7e41feSStefano Zampini Output Parameter: 46105b7e41feSStefano Zampini . 
a - pointer to the device data 46115b7e41feSStefano Zampini 46125b7e41feSStefano Zampini Level: developer 46135b7e41feSStefano Zampini 461411a5261eSBarry Smith Note: 461511a5261eSBarry Smith Does not trigger host-device copies and flags data validity on the GPU 46165b7e41feSStefano Zampini 46171cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 46185b7e41feSStefano Zampini @*/ 4619d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) 4620d71ae5a4SJacob Faibussowitsch { 4621ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4622ed502f03SStefano Zampini CsrMatrix *csr; 4623ed502f03SStefano Zampini 4624ed502f03SStefano Zampini PetscFunctionBegin; 4625ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 46264f572ea9SToby Isaac PetscAssertPointer(a, 2); 4627ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4628aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 462928b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4630ed502f03SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 463128b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4632ed502f03SStefano Zampini *a = csr->values->data().get(); 4633039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 46349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 46353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4636ed502f03SStefano Zampini } 4637ed502f03SStefano Zampini 46385b7e41feSStefano Zampini /*@C 463911a5261eSBarry Smith MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from 
`MatSeqAIJCUSPARSEGetArrayWrite()` 46405b7e41feSStefano Zampini 46415b7e41feSStefano Zampini Not Collective 46425b7e41feSStefano Zampini 46432ef1f0ffSBarry Smith Input Parameters: 46442ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix 46452ef1f0ffSBarry Smith - a - pointer to the device data 46465b7e41feSStefano Zampini 46475b7e41feSStefano Zampini Level: developer 46485b7e41feSStefano Zampini 46491cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()` 46505b7e41feSStefano Zampini @*/ 4651d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) 4652d71ae5a4SJacob Faibussowitsch { 4653ed502f03SStefano Zampini PetscFunctionBegin; 4654ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 46554f572ea9SToby Isaac PetscAssertPointer(a, 2); 4656ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 46579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 46589566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4659ed502f03SStefano Zampini *a = NULL; 46603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4661ed502f03SStefano Zampini } 4662ed502f03SStefano Zampini 46639371c9d4SSatish Balay struct IJCompare4 { 4664d71ae5a4SJacob Faibussowitsch __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4665d71ae5a4SJacob Faibussowitsch { 46660b156cc8SJunchao Zhang if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; 46670b156cc8SJunchao Zhang if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2); 4668ed502f03SStefano Zampini return false; 4669ed502f03SStefano Zampini } 4670ed502f03SStefano Zampini }; 4671ed502f03SStefano Zampini 46729371c9d4SSatish Balay struct Shift { 4673ed502f03SStefano Zampini int _shift; 4674ed502f03SStefano Zampini 4675ed502f03SStefano Zampini 
Shift(int shift) : _shift(shift) { } 46769371c9d4SSatish Balay __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4677ed502f03SStefano Zampini }; 4678ed502f03SStefano Zampini 467921afe8ebSBarry Smith /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */ 4680d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4681d71ae5a4SJacob Faibussowitsch { 4682ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4683ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4684ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4685ed502f03SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 4686ed502f03SStefano Zampini PetscInt Annz, Bnnz; 4687ed502f03SStefano Zampini cusparseStatus_t stat; 4688ed502f03SStefano Zampini PetscInt i, m, n, zero = 0; 4689ed502f03SStefano Zampini 4690ed502f03SStefano Zampini PetscFunctionBegin; 4691ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4692ed502f03SStefano Zampini PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 46934f572ea9SToby Isaac PetscAssertPointer(C, 4); 4694ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4695ed502f03SStefano Zampini PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 46965f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 469708401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4698aed4548fSBarry Smith PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4699aed4548fSBarry Smith PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && 
Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4700ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4701ed502f03SStefano Zampini m = A->rmap->n; 4702ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 47039566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, C)); 47049566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C, m, n, m, n)); 47059566063dSJacob Faibussowitsch PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4706ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4707ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4708ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4709ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4710ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4711ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4712ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4713ed502f03SStefano Zampini c->compressedrow.i = NULL; 4714ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4715ed502f03SStefano Zampini Ccusp->workVector = NULL; 4716ed502f03SStefano Zampini Ccusp->nrows = m; 4717ed502f03SStefano Zampini Ccusp->mat = Cmat; 4718ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4719ed502f03SStefano Zampini Ccsr->num_rows = m; 4720ed502f03SStefano Zampini Ccsr->num_cols = n; 47219566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 47229566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 47239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4724f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 4725f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 4726f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 47279566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47289566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 47319566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 473228b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 473328b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4734ed502f03SStefano Zampini 4735ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4736ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4737ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4738ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4739ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4740ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4741ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4742ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4743ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 47442c4ab24aSJunchao Zhang Ccusp->coords = new THRUSTINTARRAY(c->nz); 4745ed502f03SStefano Zampini if (c->nz) { 47462ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 47472ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 47482ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 47492ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff, *Broff; 47502ed87e7eSStefano Zampini 4751ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4752ed502f03SStefano Zampini if 
(!Acusp->rowoffsets_gpu) { 4753ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4754ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 47559566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4756ed502f03SStefano Zampini } 47572ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 47582ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4759ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4760ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4761ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4762ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 47639566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4764ed502f03SStefano Zampini } 47652ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 47662ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 47679566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 47689371c9d4SSatish Balay stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 47699371c9d4SSatish Balay PetscCallCUSPARSE(stat); 47709371c9d4SSatish Balay stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 47719371c9d4SSatish Balay PetscCallCUSPARSE(stat); 47722ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 47732ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 47742ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 47758909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4776ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4777ed502f03SStefano Zampini auto Bcie = 
thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 47788909a122SStefano Zampini #else 47798909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 47808909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 47818909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 47828909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 47838909a122SStefano Zampini #endif 47842ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 47852ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 47862ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 47872ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 47882ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 47892ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 47902c4ab24aSJunchao Zhang auto p1 = Ccusp->coords->begin(); 47912c4ab24aSJunchao Zhang auto p2 = Ccusp->coords->begin(); 4792ed502f03SStefano Zampini thrust::advance(p2, Annz); 4793792fecdfSBarry Smith PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 47948909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 47958909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 47968909a122SStefano Zampini #endif 47972ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 47982ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 
47992ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 4800792fecdfSBarry Smith PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 48012ed87e7eSStefano Zampini #else 48022ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 4803792fecdfSBarry Smith PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4804792fecdfSBarry Smith PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 48052ed87e7eSStefano Zampini #endif 48069371c9d4SSatish Balay stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 48079371c9d4SSatish Balay PetscCallCUSPARSE(stat); 48089566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 48092ed87e7eSStefano Zampini delete wPerm; 48102ed87e7eSStefano Zampini delete Acoo; 48112ed87e7eSStefano Zampini delete Bcoo; 48122ed87e7eSStefano Zampini delete Ccoo; 4813ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 48149371c9d4SSatish Balay stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 48159371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4816ed502f03SStefano Zampini #endif 48171a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 48189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 48199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4820ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? 
PETSC_TRUE : PETSC_FALSE; 4821ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4822ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4823ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4824ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4825ed502f03SStefano Zampini 48261a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 48271a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4828a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4829ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4830ed502f03SStefano Zampini CmatT->mat = CcsrT; 4831ed502f03SStefano Zampini CcsrT->num_rows = n; 4832ed502f03SStefano Zampini CcsrT->num_cols = m; 4833ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4834ed502f03SStefano Zampini 4835ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4836ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4837ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4838ed502f03SStefano Zampini 48399566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4840ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4841ed502f03SStefano Zampini if (AT) { 4842ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4843ed502f03SStefano Zampini thrust::advance(rT, -1); 4844ed502f03SStefano Zampini } 4845ed502f03SStefano Zampini if (BT) { 4846ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4847ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4848ed502f03SStefano Zampini thrust::copy(titb, tite, rT); 4849ed502f03SStefano Zampini } 4850ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4851ed502f03SStefano Zampini if (AT) cT 
= thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4852ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4853ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4854ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4855ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 48569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4857ed502f03SStefano Zampini 48589566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 48599566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 48609566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4861f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar))); 4862f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar))); 4863f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar))); 48649566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 48659566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 48669566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4867ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 48689371c9d4SSatish Balay stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, 
cusparse_scalartype); 48699371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4870ed502f03SStefano Zampini #endif 4871ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4872ed502f03SStefano Zampini } 4873ed502f03SStefano Zampini } 4874ed502f03SStefano Zampini 4875ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 48769f0612e4SBarry Smith PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 48779f0612e4SBarry Smith PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 4878ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 48797de69702SBarry Smith if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 4880ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4881ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4882ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4883ed502f03SStefano Zampini jj = *Ccsr->column_indices; 48849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 48859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4886ed502f03SStefano Zampini } else { 48879566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 48889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4889ed502f03SStefano Zampini } 48909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 48919566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 48929566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->imax)); 
4893ed502f03SStefano Zampini c->maxnz = c->nz; 4894ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4895ed502f03SStefano Zampini c->rmax = 0; 4896ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4897ed502f03SStefano Zampini const PetscInt nn = c->i[i + 1] - c->i[i]; 4898ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4899ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4900ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 4901ed502f03SStefano Zampini } 49029566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 49039566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->a)); 4904ed502f03SStefano Zampini (*C)->nonzerostate++; 49059566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 49069566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 4907ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4908ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4909ed502f03SStefano Zampini } else { 491008401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4911ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4912ed502f03SStefano Zampini if (c->nz) { 4913ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 49142c4ab24aSJunchao Zhang PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords"); 4915aed4548fSBarry Smith PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 491608401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 49179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 49189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 49195f80ce2aSJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, 
PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 49205f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4921ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4922ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4923ed502f03SStefano Zampini Ccsr = (CsrMatrix *)Ccusp->mat->mat; 4924aed4548fSBarry Smith PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4925aed4548fSBarry Smith PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4926aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4927aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 49282c4ab24aSJunchao Zhang PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size()); 49292c4ab24aSJunchao Zhang auto pmid = Ccusp->coords->begin(); 4930ed502f03SStefano Zampini thrust::advance(pmid, Acsr->num_entries); 49319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 49322c4ab24aSJunchao Zhang auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin()))); 49339371c9d4SSatish Balay auto zieait = 
thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4934ed502f03SStefano Zampini thrust::for_each(zibait, zieait, VecCUDAEquals()); 49359371c9d4SSatish Balay auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 49362c4ab24aSJunchao Zhang auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end()))); 4937ed502f03SStefano Zampini thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 49389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 49391a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 49405f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4941ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4942ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4943ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4944ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 4945ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4946ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4947ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 49481a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4949ed502f03SStefano Zampini } 49509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4951ed502f03SStefano Zampini } 4952ed502f03SStefano Zampini } 49539566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4954ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4955ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4956ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 49573ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4958ed502f03SStefano Zampini } 4959c215019aSStefano Zampini /* Gather the stored values of the AIJ matrix A at positions idx[] into the output array v[]. v may point to host or to device memory (detected with isCudaMem()); if idx is NULL, the first n values are copied contiguously instead. */ 4960d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 4961d71ae5a4SJacob Faibussowitsch { 4962c215019aSStefano Zampini bool dmem; /* true when v resides in device memory */ 4963c215019aSStefano Zampini const PetscScalar *av; /* device pointer to A's value array (read-only) */ 4964c215019aSStefano Zampini 4965c215019aSStefano Zampini PetscFunctionBegin; 4966c215019aSStefano Zampini dmem = isCudaMem(v); 49679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av)); 4968c215019aSStefano Zampini if (n && idx) { /* indexed gather path */ 4969c215019aSStefano Zampini THRUSTINTARRAY widx(n); /* upload idx[] to the device */ 4970c215019aSStefano Zampini widx.assign(idx, idx + n); 49719566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 4972c215019aSStefano Zampini 4973c215019aSStefano Zampini THRUSTARRAY *w = NULL; /* scratch device buffer, allocated only when v is host memory */ 4974c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 4975c215019aSStefano Zampini if (dmem) { 4976c215019aSStefano Zampini dv =
thrust::device_pointer_cast(v); 4977c215019aSStefano Zampini } else { 4978c215019aSStefano Zampini w = new THRUSTARRAY(n); 4979c215019aSStefano Zampini dv = w->data(); 4980c215019aSStefano Zampini } 4981c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4982c215019aSStefano Zampini 4983c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv)); 4984c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n)); 4985c215019aSStefano Zampini thrust::for_each(zibit, zieit, VecCUDAEquals()); /* device-side gather: dv[i] = av[widx[i]] */ 498648a46eb9SPierre Jolivet if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); /* bring gathered values back to the host buffer v */ 4987c215019aSStefano Zampini delete w; 4988c215019aSStefano Zampini } else { /* idx == NULL (or n == 0): contiguous copy of the first n values */ 49899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4990c215019aSStefano Zampini } 49919566063dSJacob Faibussowitsch if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); /* NOTE(review): the transfer just logged is device->host; PetscLogGpuToCpu() looks intended here — confirm */ 49929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av)); 49933ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4994c215019aSStefano Zampini } 4995b0c00012SPierre Jolivet PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 4996