19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 69ae82921SPaul Mullowney 73d13b8fdSMatthew G. Knepley #include <petscconf.h> 83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 11af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 129ae82921SPaul Mullowney #undef VecType 133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14 16d0967f54SJacob Faibussowitsch #define PETSC_HAVE_THRUST_ASYNC 1 17d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14 18a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 19d0967f54SJacob Faibussowitsch #endif 20a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 21a2cee5feSJed Brown #include <thrust/remove.h> 22a2cee5feSJed Brown #include <thrust/sort.h> 23a2cee5feSJed Brown #include <thrust/unique.h> 24e8d2b73aSMark Adams 25e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 26afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 27afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 28afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
29afb2bd1cSJunchao Zhang 30afb2bd1cSJunchao Zhang typedef enum { 31afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 32afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 33afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 35afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 36afb2bd1cSJunchao Zhang 37afb2bd1cSJunchao Zhang typedef enum { 38afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 39afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 40afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 41afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 42afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 43afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 47afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 48afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 49afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 50afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 51afb2bd1cSJunchao Zhang 52afb2bd1cSJunchao Zhang typedef enum { 5335cb6cd3SPierre Jolivet CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic 5435cb6cd3SPierre Jolivet CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministic 55afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 56afb2bd1cSJunchao Zhang */ 57afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", 
"CUSPARSE_SPMM_", 0}; 59afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 60afb2bd1cSJunchao Zhang #endif 619ae82921SPaul Mullowney 62087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 63087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 656fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 66b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 686fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 69d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 71d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 72d460d7bfSJunchao Zhang #endif 73dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject); 74a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 7533c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 766fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 776fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 786fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 796fa9248bSJed Brown static PetscErrorCode 
MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

// Type-specific implementation behind the MatCUSPARSESetFormat() method composition.
// Stores the requested GPU storage format in the matrix's Mat_SeqAIJCUSPARSE context.
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: // both supported ops set the same (single) format field on a Seq matrix
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
           `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  // Dispatch to the type-specific implementation if the matrix type provides one; no-op otherwise
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Type-specific implementation behind MatCUSPARSESetUseCPUSolve(); records the flag in the
// matrix's Mat_SeqAIJCUSPARSE context.
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance
  keeping the data on the CPU and computing the solve there. Use this method to specify
  whether the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

// MatSetOption implementation: handles MAT_FORM_EXPLICIT_TRANSPOSE specially (the cached
// transpose must be invalidated when the option is turned off) and defers everything else
// to the base SeqAIJ implementation.
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
    break;
  default:
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Options processing for MATSEQAIJCUSPARSE: storage format(s), CPU-solve flag and, for
// CUDA >= 11, the cuSPARSE SpMV/SpMM/csr2csc algorithm choices. Only meaningful for
// unfactored matrices (A->factortype == MAT_FACTOR_NONE).
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build (or refresh) the device-side combined L+U factored matrix used by the CUDA >= 11.4
// SpSV-based triangular solves. PETSc's SeqAIJ factorization stores L and U in a skewed
// layout (L rows forward in a->i/a->j, U rows backward via a->diag, with the diagonal held
// as its reciprocal); this routine re-packs both factors into one ordinary CSR matrix M on
// the host, uploads it, and (on first call) creates the cuSPARSE descriptors, dense work
// vectors, and SpSV buffers. Runs only when the latest factors live on the CPU.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];   // strictly-lower entries in row i
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // upper entries in row i, incl. diagonal
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L
        Mj[Mi[i] + llen] = i;                                   // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // L and U share the same CSR arrays; only fill mode / diag type differ
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];       // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
// Legacy (CUDA < 11.4) path: build the unit-diagonal lower-triangular factor L as a
// standalone device CSR matrix plus csrsv analysis info. On first call everything is
// allocated and analyzed; on refactorization only the numerical values are re-uploaded
// (the sparsity pattern of an (I)LU factor is fixed after symbolic factorization).
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0] = (PetscInt)0;
        AiLo[n] = nzLower;
        AjLo[0] = (PetscInt)0;
        AALo[0] = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0; /* unit diagonal of L */
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo; /* keep host values for later numeric-only refreshes */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v      = aa;
        vi     = aj;
        offset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - builds (or refreshes) the upper-triangular
  factor U of a host-side ILU factorization as a cuSPARSE CSR triangular matrix and
  runs the csrsv solve analysis for it.

  The host factored matrix stores U rows ending at adiag[] in a "skewed" layout
  (see the loop below, which walks rows backwards from adiag[i+1]+1); this routine
  repacks it into a regular CSR (AiUp/AjUp/AAUp) with the reciprocal of the diagonal
  stored explicitly (CUSPARSE_DIAG_TYPE_NON_UNIT).

  First call (upTriFactor == NULL): allocates the factor struct, descriptor, CSR
  arrays on the device, and the csrsv analysis info/buffer.
  Subsequent calls: only refills the values (AA_h) and re-uploads them.

  Collective; errors are returned via the PETSc error-code convention.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first time: build structure + values */
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first since
           the skewed host layout is most easily unwound backwards */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements; the host factor stores the diagonal
             inverted at v[nz], so store its reciprocal-of-reciprocal form directly */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AA_h (pinned host values) is kept for fast value-only updates */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* structure exists: update values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - top-level driver that pushes the host
  ILU factors of A to the GPU and records the row/column permutations.

  With CUDA >= 11.4 it delegates to MatSeqAIJCUSPARSEBuildFactoredMatrix_LU();
  otherwise it builds the legacy lower/upper csrsv triangular factors and a
  work vector for the two-stage triangular solve.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, iscol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

  cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices: cache the row permutation on the device (only when it
     is not the identity, and only once) */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices: likewise for the column permutation */
  PetscCall(ISIdentity(iscol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(iscol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(iscol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky - uploads the host Cholesky (Ut D U)
  factorization of A to the GPU as a single cuSPARSE generic-API CSR matrix U (unit
  diagonal) plus a separate diagonal vector D, and runs SpSV analysis for both the
  U and Ut triangular solves.

  NOTE(review): "_Cheolesky" is a long-standing misspelling of "Cholesky"; the name
  is kept because callers in this file reference it.

  First call (fs->csrRowPtr == NULL): allocates device arrays, descriptors, work
  vectors X/Y and SpSV buffers, and caches host staging arrays (csrVal_h, diag_h).
  Every call: refreshes the values from the host factor and reruns SpSV analysis
  (which is numeric and needs up-to-date values).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*(fs->diag)) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // off-diagonal values are negated on upload
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve Ut D U x = b
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    /* no row permutation: feed b's device array to SpSV directly (cast drops const
       only for the descriptor API; the Ut solve reads X and writes Y) */
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); // two triangular solves + diagonal scaling
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - legacy (CUDA < 11.4) path that uploads a
  host ICC factorization to the GPU as two cuSPARSE csrsv triangular factors: the
  upper factor U (solved non-transposed) and the lower factor built as Ut (the same
  CSR pattern solved with CUSPARSE_OPERATION_TRANSPOSE).

  NOTE(review): A->data is cast both to Mat_SeqAIJ (for a->nz) and Mat_SeqSBAIJ
  (for b->i/b->j/b->a) — presumably the ICC factor is stored in SBAIJ layout here;
  confirm against the factor-symbolic routine before changing.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 790087f3262SPaul Mullowney try { 7919566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 7929566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 793da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 794087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 7959566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 7969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 797087f3262SPaul Mullowney 798087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 799087f3262SPaul Mullowney AiUp[0] = (PetscInt)0; 800087f3262SPaul Mullowney AiUp[n] = nzUpper; 801087f3262SPaul Mullowney offset = 0; 802087f3262SPaul Mullowney for (i = 0; i < n; i++) { 803087f3262SPaul Mullowney /* set the pointers */ 804087f3262SPaul Mullowney v = aa + ai[i]; 805087f3262SPaul Mullowney vj = aj + ai[i]; 806087f3262SPaul Mullowney nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 807087f3262SPaul Mullowney 808087f3262SPaul Mullowney /* first, set the diagonal elements */ 809087f3262SPaul Mullowney AjUp[offset] = (PetscInt)i; 81009f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0 / v[nz]; 811087f3262SPaul Mullowney AiUp[i] = offset; 81209f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0 / v[nz]; 813087f3262SPaul Mullowney 814087f3262SPaul Mullowney offset += 1; 815087f3262SPaul Mullowney if (nz > 0) { 8169566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 8179566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 818087f3262SPaul Mullowney for (j = offset; j < offset + nz; j++) { 819087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 820087f3262SPaul Mullowney AALo[j] = AAUp[j] / v[nz]; 
821087f3262SPaul Mullowney } 822087f3262SPaul Mullowney offset += nz; 823087f3262SPaul Mullowney } 824087f3262SPaul Mullowney } 825087f3262SPaul Mullowney 826aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8279566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 828da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 829087f3262SPaul Mullowney 830aa372e3fSPaul Mullowney /* Create the matrix description */ 8319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 8329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 835afb2bd1cSJunchao Zhang #else 8369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 837afb2bd1cSJunchao Zhang #endif 8389566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 840087f3262SPaul Mullowney 841aa372e3fSPaul Mullowney /* set the matrix */ 842aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 843aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 844aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 845aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 846aa372e3fSPaul Mullowney 847aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 848aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 849aa372e3fSPaul Mullowney 850aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new 
THRUSTINTARRAY32(a->nz); 851aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 852aa372e3fSPaul Mullowney 853aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 854aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 855aa372e3fSPaul Mullowney 856afb2bd1cSJunchao Zhang /* set the operation */ 857afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 858afb2bd1cSJunchao Zhang 859afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 8609566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 861261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 8621b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8639371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 8649371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 8659566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 866afb2bd1cSJunchao Zhang #endif 867afb2bd1cSJunchao Zhang 868aa372e3fSPaul Mullowney /* perform the solve analysis */ 8699371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 8709f7ba44dSJacob Faibussowitsch upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 8719f7ba44dSJacob Faibussowitsch 8729566063dSJacob 
Faibussowitsch PetscCallCUDA(WaitForCUDA()); 8739566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 874aa372e3fSPaul Mullowney 875da79fbbcSStefano Zampini /* assign the pointer */ 876aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 877aa372e3fSPaul Mullowney 878aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8799566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 880da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 881aa372e3fSPaul Mullowney 882aa372e3fSPaul Mullowney /* Create the matrix description */ 8839566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 8849566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8851b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8869566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 887afb2bd1cSJunchao Zhang #else 8889566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 889afb2bd1cSJunchao Zhang #endif 8909566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8919566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 892aa372e3fSPaul Mullowney 893aa372e3fSPaul Mullowney /* set the operation */ 894aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 895aa372e3fSPaul Mullowney 896aa372e3fSPaul Mullowney /* set the matrix */ 897aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 898aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 899aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 900aa372e3fSPaul Mullowney 
loTriFactor->csrMat->num_entries = a->nz; 901aa372e3fSPaul Mullowney 902aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 903aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 904aa372e3fSPaul Mullowney 905aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 906aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 907aa372e3fSPaul Mullowney 908aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 909aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 910aa372e3fSPaul Mullowney 911afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 9129566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 913261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 9141b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 9159371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 9169371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 9179566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 918afb2bd1cSJunchao Zhang #endif 919afb2bd1cSJunchao Zhang 920aa372e3fSPaul Mullowney /* perform the solve analysis */ 9219371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 9229f7ba44dSJacob Faibussowitsch 
loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 9239f7ba44dSJacob Faibussowitsch 9249566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9259566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 926aa372e3fSPaul Mullowney 927da79fbbcSStefano Zampini /* assign the pointer */ 928aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 929087f3262SPaul Mullowney 9309566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 9319566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 9329566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 933da79fbbcSStefano Zampini } else { 934da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 935da79fbbcSStefano Zampini offset = 0; 936da79fbbcSStefano Zampini for (i = 0; i < n; i++) { 937da79fbbcSStefano Zampini /* set the pointers */ 938da79fbbcSStefano Zampini v = aa + ai[i]; 939da79fbbcSStefano Zampini nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 940da79fbbcSStefano Zampini 941da79fbbcSStefano Zampini /* first, set the diagonal elements */ 942da79fbbcSStefano Zampini AAUp[offset] = 1.0 / v[nz]; 943da79fbbcSStefano Zampini AALo[offset] = 1.0 / v[nz]; 944da79fbbcSStefano Zampini 945da79fbbcSStefano Zampini offset += 1; 946da79fbbcSStefano Zampini if (nz > 0) { 9479566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 948da79fbbcSStefano Zampini for (j = offset; j < offset + nz; j++) { 949da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 950da79fbbcSStefano Zampini AALo[j] = AAUp[j] / v[nz]; 951da79fbbcSStefano Zampini } 952da79fbbcSStefano Zampini offset += nz; 953da79fbbcSStefano Zampini } 954da79fbbcSStefano Zampini } 95528b400f6SJacob Faibussowitsch PetscCheck(upTriFactor, 
PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
      PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
      /* factor structure already exists on the GPU: just refresh the numerical values */
      upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
      PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
    }
    PetscCallCUDA(cudaFreeHost(AAUp));
    PetscCallCUDA(cudaFreeHost(AALo));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
}
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  Builds (or refreshes) the GPU triangular-factor data needed to apply an ICC
  factorization with cuSPARSE, then uploads the row/column permutations when the
  ordering is not the identity.

  Input Parameter:
. A - the factored matrix (MATSEQAIJCUSPARSE), with a valid Mat_SeqAIJCUSPARSETriFactors in A->spptr
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* modern cuSPARSE: build the single factored-matrix representation */
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  /* legacy path: build explicit upper/lower triangular matrices and a work vector for the two-stage solve */
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif
  /* nnz of L + U, counting the shared diagonal once */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices: upload row/column permutations only for non-identity orderings */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Numeric Cholesky factorization for MATSEQAIJCUSPARSE: the factorization itself is
  done on the CPU by the SeqAIJ kernel, after which the factors are pushed to the GPU
  and the GPU solve routines are installed.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* determine which version of MatSolve needs to be used.
*/
  Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
  IS          ip = b->row;
  PetscBool   perm_identity;

  PetscCall(ISIdentity(ip, &perm_identity));
  if (perm_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/*
  Builds the transposes (CSC forms) of the lower and upper triangular factors on the
  GPU and runs the cuSPARSE csrsv analysis on them, so that MatSolveTranspose can be
  applied with NON_TRANSPOSE operations. Results are stored in
  loTriFactorPtrTranspose/upTriFactorPtrTranspose of A->spptr.

  Note: the fill mode of each transposed descriptor is deliberately flipped
  (upper <-> lower), since transposition swaps the triangles.

  Fix applied: the MAT_CUSPARSEGenerateTranspose event was begun twice (a second
  PetscLogEventBegin where PetscLogEventEnd was intended) in both the lower- and
  upper-factor paths, unbalancing the PETSc event-log stack; both are now
  PetscLogEventEnd.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; fill mode flips under transposition */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor */
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor; fill mode flips under transposition */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor */
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); /* was erroneously PetscLogEventBegin */

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* unary functor: PetscScalar -> PetscInt via the real part (used with thrust::transform) */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1220bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1221bda325fcSPaul Mullowney cusparseStatus_t stat; 1222aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1223b175d8bbSPaul Mullowney 1224bda325fcSPaul Mullowney PetscFunctionBegin; 12259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1226a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 122728b400f6SJacob Faibussowitsch PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1228a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 122908401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 12303ba16761SJacob Faibussowitsch if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 12319566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 12329566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 123348a46eb9SPierre Jolivet if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1234a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1235aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 12369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1237aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 12389566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 12399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1240aa372e3fSPaul Mullowney 1241b06137fdSPaul Mullowney /* set alpha and beta */ 12429566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar))); 12439566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar))); 12449566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 12459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 12469566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 12479566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1248b06137fdSPaul Mullowney 1249aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1250aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1251a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1252554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1253554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1254aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1255a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1256aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1257aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1258a3fdcf43SKarl Rupp 1259ad540459SPierre Jolivet if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 126081902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1261afb2bd1cSJunchao Zhang 1262afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 12633606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 12649371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, 
col idx type due to THRUSTINTARRAY32 */ 12659371c9d4SSatish Balay indexBase, cusparse_scalartype); 12669371c9d4SSatish Balay PetscCallCUSPARSE(stat); 12673606e59fSJunchao Zhang #else 12683606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 12693606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 12703606e59fSJunchao Zhang 12713606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 12723606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 12733606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 12743606e59fSJunchao Zhang */ 12753606e59fSJunchao Zhang if (matrixT->num_entries) { 12769371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 12779371c9d4SSatish Balay PetscCallCUSPARSE(stat); 12783606e59fSJunchao Zhang 12793606e59fSJunchao Zhang } else { 12803606e59fSJunchao Zhang matstructT->matDescr = NULL; 12813606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 12823606e59fSJunchao Zhang } 12833606e59fSJunchao Zhang #endif 1284afb2bd1cSJunchao Zhang #endif 1285aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1286afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1287afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1288afb2bd1cSJunchao Zhang #else 1289aa372e3fSPaul Mullowney CsrMatrix *temp = 
new CsrMatrix; 129051c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 129151c6d536SStefano Zampini /* First convert HYB to CSR */ 1292aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1293aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1294aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1295aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1296aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1297aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1298aa372e3fSPaul Mullowney 12999371c9d4SSatish Balay stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 13009371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1301aa372e3fSPaul Mullowney 1302aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1303aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1304aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1305aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1306aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1307aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1308aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1309aa372e3fSPaul Mullowney 13109371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 13119371c9d4SSatish Balay tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 13129371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1313aa372e3fSPaul Mullowney 1314aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1315aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 13169566063dSJacob 
Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 13179371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 13189371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 13199371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1320aa372e3fSPaul Mullowney 1321aa372e3fSPaul Mullowney /* assign the pointer */ 1322aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13231a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1324aa372e3fSPaul Mullowney /* delete temporaries */ 1325aa372e3fSPaul Mullowney if (tempT) { 1326aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1327aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1328aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1329aa372e3fSPaul Mullowney delete (CsrMatrix *)tempT; 1330087f3262SPaul Mullowney } 1331aa372e3fSPaul Mullowney if (temp) { 1332aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY *)temp->values; 1333aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1334aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1335aa372e3fSPaul Mullowney delete (CsrMatrix *)temp; 1336aa372e3fSPaul Mullowney } 1337afb2bd1cSJunchao Zhang #endif 1338aa372e3fSPaul Mullowney } 1339a49f1ed0SStefano Zampini } 1340a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1341a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1342a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 134328b400f6SJacob 
Faibussowitsch PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 134428b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 134528b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 134628b400f6SJacob Faibussowitsch PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 134728b400f6SJacob Faibussowitsch PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 134828b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 134928b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 135028b400f6SJacob Faibussowitsch PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1351a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1352a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1353a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 13549566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1355a49f1ed0SStefano Zampini } 1356a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1357a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1358792fecdfSBarry Smith PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1359a49f1ed0SStefano Zampini 1360a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1361a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1362a49f1ed0SStefano Zampini void *csr2cscBuffer; 1363a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 13649371c9d4SSatish Balay stat 
= cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 13659371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize); 13669371c9d4SSatish Balay PetscCallCUSPARSE(stat); 13679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1368a49f1ed0SStefano Zampini #endif 1369a49f1ed0SStefano Zampini 13701a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13711a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13721a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13731a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13741a2c6b5cSJunchao Zhang 13751a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13761a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
13771a2c6b5cSJunchao Zhang */ 13789371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1379a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 13809371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 13819371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1382a49f1ed0SStefano Zampini #else 13839371c9d4SSatish Balay matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 13849371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1385a49f1ed0SStefano Zampini #endif 13861a2c6b5cSJunchao Zhang } else { 13871a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 13881a2c6b5cSJunchao Zhang } 13891a2c6b5cSJunchao Zhang 1390a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1391792fecdfSBarry Smith PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1392a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 13939566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1394a49f1ed0SStefano Zampini #endif 1395a49f1ed0SStefano Zampini } 13969371c9d4SSatish Balay PetscCallThrust( 13979371c9d4SSatish Balay thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1398a49f1ed0SStefano Zampini } 13999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14009566063dSJacob 
PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Solve A x = b with the cusparse generic SpSV API, using the L and U factors (and the optional
   row/column permutations) stored in the Mat_SeqAIJCUSPARSETriFactors hanging off A->spptr.

   Input:  b - right hand side (read only)
   Output: x - solution (write only)

   The intermediate device buffers fs->X, fs->Y and the SpSV descriptors were set up at
   factorization time (outside this chunk) and are reused across calls. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  /* Begin GPU timing only after the vector arrays have been obtained, matching the other
     MatSolve_SeqAIJCUSPARSE* variants in this file, so that any host-to-device vector
     migration triggered by VecCUDAGetArray{Write,Read} is not attributed to the solve */
  PetscCall(PetscLogGpuTimeBegin());

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) { // when a column permutation must be applied afterwards, solve into the intermediate buffer fs->X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L.
We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Analysis for the transpose solves is done lazily here.
     NOTE(review): fs->updatedTransposeSpSVAnalysis is presumably reset when the numeric
     factorization is redone — the reset site is outside this chunk; confirm there. */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X  (transpose solve order is U^T first, then L^T — the reverse of the forward solve)
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ...
on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: gather b through rpermIndices into x, which then
     serves as the input of the first triangular solve */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* First, solve U (transpose solve order is the reverse of the forward solve: U then L);
     input xarray, output into the work vector tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L: input tempGPU, output back into xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution.
*/
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Same transpose solve as above, but no row/column permutations are applied to b or x */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U: input barray, output into the work vector tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L: input tempGPU, output into xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Solve A x = b with the legacy cusparse csrsv API, using the L/U factors in A->spptr;
   the row permutation is applied to b first and the column permutation to x last */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: gather b through rpermIndices into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: input tempGPU, output into xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: input xarray, output back into tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 16589566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 16599566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 16613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16629ae82921SPaul Mullowney } 16639ae82921SPaul Mullowney 1664d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1665d71ae5a4SJacob Faibussowitsch { 1666465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1667465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 16689ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1669aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1670aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1671aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 16729ae82921SPaul Mullowney 16739ae82921SPaul Mullowney PetscFunctionBegin; 1674e057df02SPaul Mullowney /* Get the GPU pointers */ 16759566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 16769566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 16779ae82921SPaul Mullowney 16789566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1679aa372e3fSPaul Mullowney /* First, solve L */ 16809f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 16819f7ba44dSJacob Faibussowitsch 
loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 1682d49cd2b7SBarry Smith 1683aa372e3fSPaul Mullowney /* Next, solve U */ 16849f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 16859f7ba44dSJacob Faibussowitsch upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 16869ae82921SPaul Mullowney 16879566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 16889566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 16899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 16913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16929ae82921SPaul Mullowney } 1693d460d7bfSJunchao Zhang #endif 16949ae82921SPaul Mullowney 1695b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 16968eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *) 1697d71ae5a4SJacob Faibussowitsch { 1698da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1699da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1700da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1701da112707SJunchao Zhang CsrMatrix *Acsr; 1702da112707SJunchao Zhang PetscInt m, nz; 1703da112707SJunchao Zhang PetscBool flg; 1704da112707SJunchao Zhang 
1705da112707SJunchao Zhang PetscFunctionBegin; 1706da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1707da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1708da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1709da112707SJunchao Zhang } 1710da112707SJunchao Zhang 1711da112707SJunchao Zhang /* Copy A's value to fact */ 1712da112707SJunchao Zhang m = fact->rmap->n; 1713da112707SJunchao Zhang nz = aij->nz; 1714da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1715da112707SJunchao Zhang Acsr = (CsrMatrix *)Acusp->mat->mat; 1716da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1717da112707SJunchao Zhang 1718da112707SJunchao Zhang /* Factorize fact inplace */ 17199371c9d4SSatish Balay if (m) 17209371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1721d460d7bfSJunchao Zhang fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1722da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1723da112707SJunchao Zhang int numerical_zero; 1724da112707SJunchao Zhang cusparseStatus_t status; 1725da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1726da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1727da112707SJunchao Zhang } 1728da112707SJunchao Zhang 172912ba2bc6SJunchao Zhang /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 173012ba2bc6SJunchao Zhang See 
discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 173112ba2bc6SJunchao Zhang */ 17329371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1733da112707SJunchao Zhang 17349371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1735da112707SJunchao Zhang 173612ba2bc6SJunchao Zhang /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 173712ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 173812ba2bc6SJunchao Zhang 1739da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 1740d460d7bfSJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t. 
1741d460d7bfSJunchao Zhang fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 1742da112707SJunchao Zhang fact->ops->matsolve = NULL; 1743da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 1744da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 17453ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1746da112707SJunchao Zhang } 1747da112707SJunchao Zhang 17488eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info) 1749d71ae5a4SJacob Faibussowitsch { 1750da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1751da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1752da112707SJunchao Zhang PetscInt m, nz; 1753da112707SJunchao Zhang 1754da112707SJunchao Zhang PetscFunctionBegin; 1755da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1756da112707SJunchao Zhang PetscInt i; 1757da112707SJunchao Zhang PetscBool flg, missing; 1758da112707SJunchao Zhang 1759da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1760da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1761da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1762da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A, &missing, &i)); 1763da112707SJunchao Zhang PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1764da112707SJunchao Zhang } 1765da112707SJunchao Zhang 1766da112707SJunchao Zhang /* Free the old stale stuff */ 1767da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1768da112707SJunchao Zhang 1769da112707SJunchao 
Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1770da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 1771da112707SJunchao Zhang */ 1772da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1773da112707SJunchao Zhang 1774da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 1775da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ILU; 1776da112707SJunchao Zhang fact->info.factor_mallocs = 0; 1777da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 1778da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 1779da112707SJunchao Zhang 1780da112707SJunchao Zhang aij->row = NULL; 1781da112707SJunchao Zhang aij->col = NULL; 1782da112707SJunchao Zhang 1783da112707SJunchao Zhang /* ====================================================================== */ 1784da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. */ 1785da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 1786da112707SJunchao Zhang /* ====================================================================== */ 1787da112707SJunchao Zhang const int *Ai, *Aj; 1788da112707SJunchao Zhang 1789da112707SJunchao Zhang m = fact->rmap->n; 1790da112707SJunchao Zhang nz = aij->nz; 1791da112707SJunchao Zhang 1792d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1))); 1793d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz)); 1794d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz)); 1795d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
The returned Ai, Aj are 32-bit */ 1796d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1797d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1798da112707SJunchao Zhang 1799da112707SJunchao Zhang /* ====================================================================== */ 1800da112707SJunchao Zhang /* Create descriptors for M, L, U */ 1801da112707SJunchao Zhang /* ====================================================================== */ 1802da112707SJunchao Zhang cusparseFillMode_t fillMode; 1803da112707SJunchao Zhang cusparseDiagType_t diagType; 1804da112707SJunchao Zhang 1805da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1806da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1807da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1808da112707SJunchao Zhang 1809da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1810da112707SJunchao Zhang cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1811da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1812da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1813da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1814da112707SJunchao Zhang */ 1815da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 1816da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_UNIT; 1817d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 18189371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 18199371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1820da112707SJunchao Zhang 1821da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_UPPER; 1822da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1823d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 18249371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 18259371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1826da112707SJunchao Zhang 1827da112707SJunchao Zhang /* ========================================================================= */ 1828da112707SJunchao Zhang /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1829da112707SJunchao Zhang /* ========================================================================= */ 1830da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 18319371c9d4SSatish Balay if (m) 18329371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1833d460d7bfSJunchao Zhang 
fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1834da112707SJunchao Zhang 1835da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1836da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1837da112707SJunchao Zhang 1838da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1839da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1840da112707SJunchao Zhang 1841da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 18429371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1843da112707SJunchao Zhang 1844da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 18459371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1846da112707SJunchao Zhang 1847da112707SJunchao Zhang /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 184812ba2bc6SJunchao Zhang and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 184912ba2bc6SJunchao Zhang spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 185012ba2bc6SJunchao Zhang To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1851da112707SJunchao Zhang */ 185212ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 185312ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 185412ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 1855da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 185612ba2bc6SJunchao Zhang } else { 185712ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 185812ba2bc6SJunchao Zhang fs->spsvBuffer_U = fs->factBuffer_M; 1859da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 186012ba2bc6SJunchao Zhang } 1861da112707SJunchao Zhang 1862da112707SJunchao Zhang /* ========================================================================== */ 1863da112707SJunchao Zhang /* Perform analysis of ilu0 on M, SpSv on L and U */ 1864da112707SJunchao Zhang /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1865da112707SJunchao Zhang /* ========================================================================== */ 1866da112707SJunchao Zhang int structural_zero; 1867da112707SJunchao Zhang cusparseStatus_t status; 1868da112707SJunchao Zhang 1869da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 18709371c9d4SSatish Balay if (m) 18719371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1872d460d7bfSJunchao Zhang fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1873da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1874da112707SJunchao Zhang /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1875da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1876da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1877da112707SJunchao Zhang } 1878da112707SJunchao Zhang 1879da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 18800dd8c0acSJunchao Zhang { 1881da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 18820dd8c0acSJunchao Zhang PetscInt *Ai, *Adiag, nzRow, nzLeft; 1883da112707SJunchao Zhang PetscLogDouble flops = 0.0; 1884da112707SJunchao Zhang 1885da112707SJunchao Zhang PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1886da112707SJunchao Zhang Ai = Aseq->i; 1887da112707SJunchao Zhang Adiag = Aseq->diag; 1888da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 1889da112707SJunchao Zhang if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1890da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 1891da112707SJunchao Zhang nzLeft = Adiag[i] - Ai[i]; 1892da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1893da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 
1894da112707SJunchao Zhang */ 1895da112707SJunchao Zhang nzLeft = (nzRow - 1) / 2; 1896da112707SJunchao Zhang flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1897da112707SJunchao Zhang } 1898da112707SJunchao Zhang } 1899da112707SJunchao Zhang fs->numericFactFlops = flops; 19000dd8c0acSJunchao Zhang } 1901da112707SJunchao Zhang fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 19023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1903da112707SJunchao Zhang } 1904da112707SJunchao Zhang 1905d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1906d71ae5a4SJacob Faibussowitsch { 1907da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1908da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1909da112707SJunchao Zhang const PetscScalar *barray; 1910da112707SJunchao Zhang PetscScalar *xarray; 1911da112707SJunchao Zhang 1912da112707SJunchao Zhang PetscFunctionBegin; 1913da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1914da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1915da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1916da112707SJunchao Zhang 1917da112707SJunchao Zhang /* Solve L*y = b */ 1918da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1919da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 19209371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 19219371c9d4SSatish Balay fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1922da112707SJunchao Zhang 1923da112707SJunchao Zhang /* Solve Lt*x = y */ 1924da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 19259371c9d4SSatish Balay 
PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 19269371c9d4SSatish Balay fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1927da112707SJunchao Zhang 1928da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1929da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1930da112707SJunchao Zhang 1931da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1932da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 19333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1934da112707SJunchao Zhang } 1935da112707SJunchao Zhang 19368eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *) 1937d71ae5a4SJacob Faibussowitsch { 1938da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1939da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1940da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1941da112707SJunchao Zhang CsrMatrix *Acsr; 1942da112707SJunchao Zhang PetscInt m, nz; 1943da112707SJunchao Zhang PetscBool flg; 1944da112707SJunchao Zhang 1945da112707SJunchao Zhang PetscFunctionBegin; 1946da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1947da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1948da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1949da112707SJunchao Zhang } 1950da112707SJunchao Zhang 1951da112707SJunchao Zhang /* Copy A's value to fact */ 1952da112707SJunchao Zhang m = fact->rmap->n; 1953da112707SJunchao Zhang nz = aij->nz; 1954da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1955da112707SJunchao Zhang 
Acsr = (CsrMatrix *)Acusp->mat->mat; 1956da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1957da112707SJunchao Zhang 1958da112707SJunchao Zhang /* Factorize fact inplace */ 1959da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1960da112707SJunchao Zhang Function csric02() only takes the lower triangular part of matrix A to perform factorization. 1961da112707SJunchao Zhang The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1962da112707SJunchao Zhang and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1963da112707SJunchao Zhang In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1964da112707SJunchao Zhang */ 1965d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1966da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1967da112707SJunchao Zhang int numerical_zero; 1968da112707SJunchao Zhang cusparseStatus_t status; 1969da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1970da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1971da112707SJunchao Zhang } 1972da112707SJunchao Zhang 19739371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1974da112707SJunchao Zhang 1975da112707SJunchao 
Zhang /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1976da112707SJunchao Zhang ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1977da112707SJunchao Zhang */ 19789371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1979da112707SJunchao Zhang 1980da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 1981da112707SJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1982da112707SJunchao Zhang fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1983da112707SJunchao Zhang fact->ops->matsolve = NULL; 1984da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 1985da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 19863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1987da112707SJunchao Zhang } 1988da112707SJunchao Zhang 19898eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info) 1990d71ae5a4SJacob Faibussowitsch { 1991da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1992da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1993da112707SJunchao Zhang PetscInt m, nz; 1994da112707SJunchao Zhang 1995da112707SJunchao Zhang PetscFunctionBegin; 1996da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1997da112707SJunchao Zhang PetscInt i; 1998da112707SJunchao Zhang PetscBool flg, missing; 1999da112707SJunchao Zhang 2000da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2001da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected 
MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 2002da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2003da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A, &missing, &i)); 2004da112707SJunchao Zhang PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 2005da112707SJunchao Zhang } 2006da112707SJunchao Zhang 2007da112707SJunchao Zhang /* Free the old stale stuff */ 2008da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 2009da112707SJunchao Zhang 2010da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 2011da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 2012da112707SJunchao Zhang */ 2013da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 2014da112707SJunchao Zhang 2015da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 2016da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ICC; 2017da112707SJunchao Zhang fact->info.factor_mallocs = 0; 2018da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 2019da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 2020da112707SJunchao Zhang 2021da112707SJunchao Zhang aij->row = NULL; 2022da112707SJunchao Zhang aij->col = NULL; 2023da112707SJunchao Zhang 2024da112707SJunchao Zhang /* ====================================================================== */ 2025da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. 
*/ 2026da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 2027da112707SJunchao Zhang /* ====================================================================== */ 2028da112707SJunchao Zhang const int *Ai, *Aj; 2029da112707SJunchao Zhang 2030da112707SJunchao Zhang m = fact->rmap->n; 2031da112707SJunchao Zhang nz = aij->nz; 2032da112707SJunchao Zhang 2033d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1))); 2034d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz)); 2035da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 2036da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 2037d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2038d460d7bfSJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 2039da112707SJunchao Zhang 2040da112707SJunchao Zhang /* ====================================================================== */ 2041da112707SJunchao Zhang /* Create mat descriptors for M, L */ 2042da112707SJunchao Zhang /* ====================================================================== */ 2043da112707SJunchao Zhang cusparseFillMode_t fillMode; 2044da112707SJunchao Zhang cusparseDiagType_t diagType; 2045da112707SJunchao Zhang 2046da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 2047da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 2048da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 2049da112707SJunchao Zhang 2050da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 
2051da112707SJunchao Zhang cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 2052da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 2053da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 2054da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 2055da112707SJunchao Zhang */ 2056da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 2057da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2058d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 20599371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 20609371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 2061da112707SJunchao Zhang 2062da112707SJunchao Zhang /* ========================================================================= */ 2063da112707SJunchao Zhang /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2064da112707SJunchao Zhang /* ========================================================================= */ 2065da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2066d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); 2067da112707SJunchao Zhang 2068da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 
2069da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 2070da112707SJunchao Zhang 2071da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 2072da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 2073da112707SJunchao Zhang 2074da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 20759371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 2076da112707SJunchao Zhang 2077da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 20789371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 2079da112707SJunchao Zhang 208012ba2bc6SJunchao Zhang /* To save device memory, we make the factorization buffer share with one of the solver buffer. 208112ba2bc6SJunchao Zhang See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 
208212ba2bc6SJunchao Zhang */ 208312ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 208412ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 208512ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 2086da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 208712ba2bc6SJunchao Zhang } else { 208812ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 208912ba2bc6SJunchao Zhang fs->spsvBuffer_Lt = fs->factBuffer_M; 209012ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 209112ba2bc6SJunchao Zhang } 2092da112707SJunchao Zhang 2093da112707SJunchao Zhang /* ========================================================================== */ 2094da112707SJunchao Zhang /* Perform analysis of ic0 on M */ 2095da112707SJunchao Zhang /* The lower triangular part of M has the same sparsity pattern as L */ 2096da112707SJunchao Zhang /* ========================================================================== */ 2097da112707SJunchao Zhang int structural_zero; 2098da112707SJunchao Zhang cusparseStatus_t status; 2099da112707SJunchao Zhang 2100da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2101d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 2102da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 2103da112707SJunchao Zhang /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 2104da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2105da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 2106da112707SJunchao Zhang } 2107da112707SJunchao Zhang 2108da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 21090dd8c0acSJunchao Zhang { 2110da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 21110dd8c0acSJunchao Zhang PetscInt *Ai, nzRow, nzLeft; 2112da112707SJunchao Zhang PetscLogDouble flops = 0.0; 2113da112707SJunchao Zhang 2114da112707SJunchao Zhang Ai = Aseq->i; 2115da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2116da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 2117da112707SJunchao Zhang if (nzRow > 1) { 2118da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2119da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 2120da112707SJunchao Zhang */ 2121da112707SJunchao Zhang nzLeft = (nzRow - 1) / 2; 2122da112707SJunchao Zhang flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 2123da112707SJunchao Zhang } 2124da112707SJunchao Zhang } 2125da112707SJunchao Zhang fs->numericFactFlops = flops; 21260dd8c0acSJunchao Zhang } 2127da112707SJunchao Zhang fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 21283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2129da112707SJunchao Zhang } 2130da112707SJunchao Zhang #endif 2131da112707SJunchao Zhang 2132d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 2133d460d7bfSJunchao Zhang { 2134b820271fSJunchao Zhang // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors. 
2135b820271fSJunchao Zhang Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2136d460d7bfSJunchao Zhang 2137d460d7bfSJunchao Zhang PetscFunctionBegin; 2138d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2139d460d7bfSJunchao Zhang PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 2140d460d7bfSJunchao Zhang B->offloadmask = PETSC_OFFLOAD_CPU; 2141d460d7bfSJunchao Zhang 2142d460d7bfSJunchao Zhang if (!cusparsestruct->use_cpu_solve) { 2143b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2144d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; 2145d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 2146d460d7bfSJunchao Zhang #else 2147d460d7bfSJunchao Zhang /* determine which version of MatSolve needs to be used. */ 2148d460d7bfSJunchao Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2149d460d7bfSJunchao Zhang IS isrow = b->row, iscol = b->col; 2150d460d7bfSJunchao Zhang PetscBool row_identity, col_identity; 2151d460d7bfSJunchao Zhang 2152d460d7bfSJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 2153d460d7bfSJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 2154d460d7bfSJunchao Zhang if (row_identity && col_identity) { 2155d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2156d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2157d460d7bfSJunchao Zhang } else { 2158d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2159d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2160d460d7bfSJunchao Zhang } 2161d460d7bfSJunchao Zhang #endif 2162d460d7bfSJunchao Zhang } 2163d460d7bfSJunchao Zhang B->ops->matsolve = NULL; 2164d460d7bfSJunchao Zhang B->ops->matsolvetranspose = NULL; 2165d460d7bfSJunchao Zhang 2166d460d7bfSJunchao Zhang /* get the triangular factors */ 2167d460d7bfSJunchao Zhang if (!cusparsestruct->use_cpu_solve) 
PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2168d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2169d460d7bfSJunchao Zhang } 2170d460d7bfSJunchao Zhang 2171d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2172d460d7bfSJunchao Zhang { 2173d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2174d460d7bfSJunchao Zhang 2175d460d7bfSJunchao Zhang PetscFunctionBegin; 2176d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2177d460d7bfSJunchao Zhang PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2178d460d7bfSJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2179d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2180d460d7bfSJunchao Zhang } 2181d460d7bfSJunchao Zhang 2182d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2183d71ae5a4SJacob Faibussowitsch { 2184da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2185da112707SJunchao Zhang 2186da112707SJunchao Zhang PetscFunctionBegin; 2187b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2188bc996fdcSJunchao Zhang PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2189bc996fdcSJunchao Zhang if (cusparseTriFactors->factorizeOnDevice) { 2190da112707SJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 2191da112707SJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 2192bc996fdcSJunchao Zhang } 2193da112707SJunchao Zhang if (!info->levels && row_identity && col_identity) { 2194da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 2195da112707SJunchao Zhang } else 2196da112707SJunchao Zhang #endif 
2197da112707SJunchao Zhang { 2198da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2199da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2200da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2201da112707SJunchao Zhang } 22023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2203da112707SJunchao Zhang } 2204da112707SJunchao Zhang 2205d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2206d71ae5a4SJacob Faibussowitsch { 2207da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2208da112707SJunchao Zhang 2209da112707SJunchao Zhang PetscFunctionBegin; 2210b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2211bc996fdcSJunchao Zhang PetscBool perm_identity = PETSC_FALSE; 2212bc996fdcSJunchao Zhang if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity)); 2213da112707SJunchao Zhang if (!info->levels && perm_identity) { 2214da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2215da112707SJunchao Zhang } else 2216da112707SJunchao Zhang #endif 2217da112707SJunchao Zhang { 2218da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2219da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2220da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2221da112707SJunchao Zhang } 22223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2223da112707SJunchao Zhang } 2224da112707SJunchao Zhang 2225d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2226d71ae5a4SJacob Faibussowitsch { 2227da112707SJunchao Zhang 
Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2228da112707SJunchao Zhang 2229da112707SJunchao Zhang PetscFunctionBegin; 2230da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2231da112707SJunchao Zhang PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 2232da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 22333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2234da112707SJunchao Zhang } 2235da112707SJunchao Zhang 22368eb1d50fSPierre Jolivet PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type) 2237d71ae5a4SJacob Faibussowitsch { 2238841d4cb1SJunchao Zhang PetscFunctionBegin; 2239841d4cb1SJunchao Zhang *type = MATSOLVERCUSPARSE; 22403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2241841d4cb1SJunchao Zhang } 2242841d4cb1SJunchao Zhang 2243841d4cb1SJunchao Zhang /*MC 2244841d4cb1SJunchao Zhang MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 224511a5261eSBarry Smith on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported 2246841d4cb1SJunchao Zhang algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2247841d4cb1SJunchao Zhang performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 224811a5261eSBarry Smith CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2249841d4cb1SJunchao Zhang algorithms are not recommended. This class does NOT support direct solver operations. 
2250841d4cb1SJunchao Zhang 2251841d4cb1SJunchao Zhang Level: beginner 2252841d4cb1SJunchao Zhang 22531cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, 22542ef1f0ffSBarry Smith `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2255841d4cb1SJunchao Zhang M*/ 2256841d4cb1SJunchao Zhang 2257d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) 2258d71ae5a4SJacob Faibussowitsch { 2259841d4cb1SJunchao Zhang PetscInt n = A->rmap->n; 2260bc996fdcSJunchao Zhang PetscBool factOnDevice, factOnHost; 2261bc996fdcSJunchao Zhang char *prefix; 2262bc996fdcSJunchao Zhang char factPlace[32] = "device"; /* the default */ 2263841d4cb1SJunchao Zhang 2264841d4cb1SJunchao Zhang PetscFunctionBegin; 2265841d4cb1SJunchao Zhang PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2266841d4cb1SJunchao Zhang PetscCall(MatSetSizes(*B, n, n, n, n)); 2267b820271fSJunchao Zhang (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors 2268841d4cb1SJunchao Zhang PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); 2269841d4cb1SJunchao Zhang 2270bc996fdcSJunchao Zhang prefix = (*B)->factorprefix ? 
(*B)->factorprefix : ((PetscObject)A)->prefix; 2271bc996fdcSJunchao Zhang PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat"); 2272bc996fdcSJunchao Zhang PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL)); 2273bc996fdcSJunchao Zhang PetscOptionsEnd(); 2274bc996fdcSJunchao Zhang PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice)); 2275bc996fdcSJunchao Zhang PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost)); 2276bc996fdcSJunchao Zhang PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace); 2277bc996fdcSJunchao Zhang ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice; 2278bc996fdcSJunchao Zhang 2279841d4cb1SJunchao Zhang if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2280841d4cb1SJunchao Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2281841d4cb1SJunchao Zhang PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2282841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2283841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2284841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2285841d4cb1SJunchao Zhang } else { 2286841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2287841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2288841d4cb1SJunchao Zhang } 2289841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2290841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 
2291841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2292841d4cb1SJunchao Zhang } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2293841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2294841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2295841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2296841d4cb1SJunchao Zhang } else { 2297841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2298841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2299841d4cb1SJunchao Zhang } 2300841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2301841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2302841d4cb1SJunchao Zhang } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types"); 2303841d4cb1SJunchao Zhang 2304841d4cb1SJunchao Zhang PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2305841d4cb1SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 2306841d4cb1SJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse)); 23073ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2308841d4cb1SJunchao Zhang } 2309841d4cb1SJunchao Zhang 2310d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 2311d71ae5a4SJacob Faibussowitsch { 23127e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 23137e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2314b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2315da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = 
(Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read/write access to host values: sync from GPU first, since the caller may modify them. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Host values may have been modified: mark the device copy stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Read-only access: sync from GPU but leave the offload mask untouched on restore. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Write-only access: no device-to-host copy needed since existing values will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Expose the device CSR arrays (32-bit row offsets/column indices) of an unfactored matrix. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a =
matrix->values->data().get(); 24137ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 24143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 24157ee59b9bSJunchao Zhang } 24167ee59b9bSJunchao Zhang 2417d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 2418d71ae5a4SJacob Faibussowitsch { 2419aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 24207c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 24219ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2422213423ffSJunchao Zhang PetscInt m = A->rmap->n, *ii, *ridx, tmp; 2423aa372e3fSPaul Mullowney cusparseStatus_t stat; 2424abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 24259ae82921SPaul Mullowney 24269ae82921SPaul Mullowney PetscFunctionBegin; 242728b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU"); 2428c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2429a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2430a49f1ed0SStefano Zampini CsrMatrix *matrix; 2431afb2bd1cSJunchao Zhang matrix = (CsrMatrix *)cusparsestruct->mat->mat; 243285ba7357SStefano Zampini 243308401ef6SPierre Jolivet PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 24349566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2435afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a + a->nz); 24369566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 24379566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar))); 24389566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 24399566063dSJacob Faibussowitsch 
PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 244034d6c7a5SJose E. Roman } else { 2441abb89eb1SStefano Zampini PetscInt nnz; 24429566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 24439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 24449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 24457c700b8dSJunchao Zhang delete cusparsestruct->workVector; 244681902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 2447a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 2448a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 24499ae82921SPaul Mullowney try { 24509ae82921SPaul Mullowney if (a->compressedrow.use) { 24519ae82921SPaul Mullowney m = a->compressedrow.nrows; 24529ae82921SPaul Mullowney ii = a->compressedrow.i; 24539ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 24549ae82921SPaul Mullowney } else { 2455213423ffSJunchao Zhang m = A->rmap->n; 2456213423ffSJunchao Zhang ii = a->i; 2457e6e9a74fSStefano Zampini ridx = NULL; 24589ae82921SPaul Mullowney } 245908401ef6SPierre Jolivet PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 24609371c9d4SSatish Balay if (!a->a) { 24619371c9d4SSatish Balay nnz = ii[m]; 24629371c9d4SSatish Balay both = PETSC_FALSE; 24639371c9d4SSatish Balay } else nnz = a->nz; 246408401ef6SPierre Jolivet PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 24659ae82921SPaul Mullowney 246685ba7357SStefano Zampini /* create cusparse matrix */ 2467abb89eb1SStefano Zampini cusparsestruct->nrows = m; 2468aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 24699566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 24709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 
24719566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 24729ae82921SPaul Mullowney 24739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar))); 24749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar))); 24759566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 24769566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 24779566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 24789566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 24799566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2480b06137fdSPaul Mullowney 2481aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2482aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2483aa372e3fSPaul Mullowney /* set the matrix */ 2484afb2bd1cSJunchao Zhang CsrMatrix *mat = new CsrMatrix; 2485afb2bd1cSJunchao Zhang mat->num_rows = m; 2486afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2487abb89eb1SStefano Zampini mat->num_entries = nnz; 2488afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2489afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m + 1); 24909ae82921SPaul Mullowney 2491abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 2492abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j + nnz); 2493aa372e3fSPaul Mullowney 2494abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 2495abb89eb1SStefano Zampini if 
(a->a) mat->values->assign(a->a, a->a + nnz); 2496aa372e3fSPaul Mullowney 2497aa372e3fSPaul Mullowney /* assign the pointer */ 2498afb2bd1cSJunchao Zhang matstruct->mat = mat; 2499afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2500afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 25019371c9d4SSatish Balay stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 25029371c9d4SSatish Balay CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 25039371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2504afb2bd1cSJunchao Zhang } 2505afb2bd1cSJunchao Zhang #endif 2506aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2507afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2508afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2509afb2bd1cSJunchao Zhang #else 2510afb2bd1cSJunchao Zhang CsrMatrix *mat = new CsrMatrix; 2511afb2bd1cSJunchao Zhang mat->num_rows = m; 2512afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2513abb89eb1SStefano Zampini mat->num_entries = nnz; 2514afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2515afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m + 1); 2516aa372e3fSPaul Mullowney 2517abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 2518abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j + nnz); 2519aa372e3fSPaul Mullowney 2520abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 2521abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a + nnz); 2522aa372e3fSPaul Mullowney 2523aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 
25249566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 25259371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 25269371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 25279371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2528aa372e3fSPaul Mullowney /* assign the pointer */ 2529aa372e3fSPaul Mullowney matstruct->mat = hybMat; 2530aa372e3fSPaul Mullowney 2531afb2bd1cSJunchao Zhang if (mat) { 2532afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY *)mat->values; 2533afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2534afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2535afb2bd1cSJunchao Zhang delete (CsrMatrix *)mat; 2536087f3262SPaul Mullowney } 2537afb2bd1cSJunchao Zhang #endif 2538087f3262SPaul Mullowney } 2539ca45077fSPaul Mullowney 2540aa372e3fSPaul Mullowney /* assign the compressed row indices */ 2541213423ffSJunchao Zhang if (a->compressedrow.use) { 2542213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 2543aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 2544aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx, ridx + m); 2545213423ffSJunchao Zhang tmp = m; 2546213423ffSJunchao Zhang } else { 2547213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 2548213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 2549213423ffSJunchao Zhang tmp = 0; 2550213423ffSJunchao Zhang } 25519566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2552aa372e3fSPaul Mullowney 2553aa372e3fSPaul Mullowney /* assign the 
pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Thrust functor: add the first tuple element into the second, i.e. get<1> += get<0> */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: copy the first tuple element into the second, i.e. get<1> = get<0> */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Thrust functor: copy the second tuple element into the first, i.e. get<0> = get<1> */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Scratch data attached to a MatProduct involving a SeqAIJCUSPARSE matrix; freed by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool    cisdense;  /* was C a (CPU) MATSEQDENSE when the product was set up? */
  PetscScalar *Bt;        /* device buffer holding B^T (only used pre-CUDA 11, where csrmm cannot transpose B) */
  Mat          X;         /* intermediate dense result for PtAP/RARt products */
  PetscBool    reusesym;  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
#endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* Release every GPU resource held by a MatMatCusparse scratch structure and the structure itself.
   Installed as C->product->destroy, hence the void* signature. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt)); /* cudaFree(NULL) is a no-op */
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

/* Numeric phase for products of a SeqAIJCUSPARSE matrix with a SeqDenseCUDA matrix (AB, AtB, ABt, PtAP, RARt) */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n, blda, clda;
  PetscBool           flg, biscuda;
  Mat_SeqAIJCUSPARSE *cusp;
  cusparseStatus_t    stat;
  cusparseOperation_t opA;
  const PetscScalar  *barray;
  PetscScalar        *carray;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct
*mat;
  CsrMatrix *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  /* pick the operand (A or its explicit transpose) and the output sizes for each product type */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP/RARt write into the intermediate mmdata->X; everything else writes straight into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* grow the workspace only when the required size exceeds what we already hold */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  /* restore the original (CPU) types if the caller handed us CPU matrices */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase for products of a SeqAIJCUSPARSE matrix with a SeqDenseCUDA matrix:
   sizes C, allocates the MatMatCusparse scratch data, and installs the numeric callback */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

2809ccdfe979SStefano Zampini PetscFunctionBegin; 2810ccdfe979SStefano Zampini MatCheckProduct(C, 1); 281128b400f6SJacob Faibussowitsch PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2812ccdfe979SStefano Zampini A = product->A; 2813ccdfe979SStefano Zampini B = product->B; 28149566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 281528b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2816ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 281708401ef6SPierre Jolivet PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2818ccdfe979SStefano Zampini switch (product->type) { 2819ccdfe979SStefano Zampini case MATPRODUCT_AB: 2820ccdfe979SStefano Zampini m = A->rmap->n; 2821ccdfe979SStefano Zampini n = B->cmap->n; 2822ccdfe979SStefano Zampini break; 2823ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2824ccdfe979SStefano Zampini m = A->cmap->n; 2825ccdfe979SStefano Zampini n = B->cmap->n; 2826ccdfe979SStefano Zampini break; 2827ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2828ccdfe979SStefano Zampini m = A->rmap->n; 2829ccdfe979SStefano Zampini n = B->rmap->n; 2830ccdfe979SStefano Zampini break; 2831ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2832ccdfe979SStefano Zampini m = B->cmap->n; 2833ccdfe979SStefano Zampini n = B->cmap->n; 2834ccdfe979SStefano Zampini break; 2835ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2836ccdfe979SStefano Zampini m = B->rmap->n; 2837ccdfe979SStefano Zampini n = B->rmap->n; 2838ccdfe979SStefano Zampini break; 2839d71ae5a4SJacob Faibussowitsch default: 2840d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2841ccdfe979SStefano Zampini } 28429566063dSJacob 
Faibussowitsch PetscCall(MatSetSizes(C, m, n, m, n)); 2843ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 28449566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 28459566063dSJacob Faibussowitsch PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2846ccdfe979SStefano Zampini 2847ccdfe979SStefano Zampini /* product data */ 28489566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2849ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2850afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2851afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 285248a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2853afb2bd1cSJunchao Zhang #endif 2854ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2855ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 28569566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 28579566063dSJacob Faibussowitsch PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2858ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 28599566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2860ccdfe979SStefano Zampini } else { 28619566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2862ccdfe979SStefano Zampini } 2863ccdfe979SStefano Zampini } 2864ccdfe979SStefano Zampini C->product->data = mmdata; 2865ccdfe979SStefano Zampini C->product->destroy = 
MatDestroy_MatMatCusparse; 2866ccdfe979SStefano Zampini 2867ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 28683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2869ccdfe979SStefano Zampini } 2870ccdfe979SStefano Zampini 2871d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2872d71ae5a4SJacob Faibussowitsch { 2873ccdfe979SStefano Zampini Mat_Product *product = C->product; 2874fcdce8c4SStefano Zampini Mat A, B; 2875fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2876fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2877fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2878fcdce8c4SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 2879fcdce8c4SStefano Zampini PetscBool flg; 2880fcdce8c4SStefano Zampini cusparseStatus_t stat; 2881fcdce8c4SStefano Zampini MatProductType ptype; 2882fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2883fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2884fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2885fcdce8c4SStefano Zampini #endif 2886b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2887ccdfe979SStefano Zampini 2888ccdfe979SStefano Zampini PetscFunctionBegin; 2889ccdfe979SStefano Zampini MatCheckProduct(C, 1); 289028b400f6SJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 28919566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg)); 289228b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2893fcdce8c4SStefano Zampini mmdata = (MatMatCusparse *)C->product->data; 2894fcdce8c4SStefano Zampini A = 
product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetry lets us fold AtB/ABt back into plain AB; the symbolic phase must have recorded this */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
29779566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2978fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2979fcdce8c4SStefano Zampini finalize: 2980fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 29819566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 29829566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 29839566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 2984fcdce8c4SStefano Zampini c->reallocs = 0; 2985fcdce8c4SStefano Zampini C->info.mallocs += 0; 2986fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2987fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2988fcdce8c4SStefano Zampini C->num_ass++; 29893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2990ccdfe979SStefano Zampini } 2991fcdce8c4SStefano Zampini 2992d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2993d71ae5a4SJacob Faibussowitsch { 2994fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2995fcdce8c4SStefano Zampini Mat A, B; 2996fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2997fcdce8c4SStefano Zampini Mat_SeqAIJ *a, *b, *c; 2998fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2999fcdce8c4SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 3000fcdce8c4SStefano Zampini PetscInt i, j, m, n, k; 3001fcdce8c4SStefano Zampini PetscBool flg; 3002fcdce8c4SStefano Zampini cusparseStatus_t stat; 3003fcdce8c4SStefano Zampini MatProductType ptype; 3004fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 3005fcdce8c4SStefano Zampini PetscLogDouble flops; 3006fcdce8c4SStefano Zampini PetscBool biscompressed, ciscompressed; 3007fcdce8c4SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3008fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 3009fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 3010fcdce8c4SStefano Zampini #else 3011fcdce8c4SStefano Zampini int cnz; 3012fcdce8c4SStefano Zampini #endif 3013b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 3014fcdce8c4SStefano Zampini 3015fcdce8c4SStefano Zampini PetscFunctionBegin; 3016fcdce8c4SStefano Zampini MatCheckProduct(C, 1); 301728b400f6SJacob Faibussowitsch PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 3018fcdce8c4SStefano Zampini A = product->A; 3019fcdce8c4SStefano Zampini B = product->B; 30209566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 302128b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 30229566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 302328b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 3024fcdce8c4SStefano Zampini a = (Mat_SeqAIJ *)A->data; 3025fcdce8c4SStefano Zampini b = (Mat_SeqAIJ *)B->data; 3026fcdce8c4SStefano Zampini /* product data */ 30279566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 3028fcdce8c4SStefano Zampini C->product->data = mmdata; 3029fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 3030fcdce8c4SStefano Zampini 30319566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 30329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3033d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not 
before */ 3034d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 303508401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 303608401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3037d60bce21SJunchao Zhang 3038fcdce8c4SStefano Zampini ptype = product->type; 3039b94d7dedSBarry Smith if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3040fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 3041fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3042fa046f9fSJunchao Zhang } 3043b94d7dedSBarry Smith if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3044fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 3045fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3046fa046f9fSJunchao Zhang } 3047fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 3048fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 3049fcdce8c4SStefano Zampini switch (ptype) { 3050fcdce8c4SStefano Zampini case MATPRODUCT_AB: 3051fcdce8c4SStefano Zampini m = A->rmap->n; 3052fcdce8c4SStefano Zampini n = B->cmap->n; 3053fcdce8c4SStefano Zampini k = A->cmap->n; 3054fcdce8c4SStefano Zampini Amat = Acusp->mat; 3055fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 3056fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3057fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3058fcdce8c4SStefano Zampini break; 3059fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 3060fcdce8c4SStefano Zampini m = A->cmap->n; 3061fcdce8c4SStefano Zampini n = B->cmap->n; 3062fcdce8c4SStefano Zampini k = A->rmap->n; 30639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3064fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 3065fcdce8c4SStefano Zampini Bmat 
= Bcusp->mat; 3066fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3067fcdce8c4SStefano Zampini break; 3068fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 3069fcdce8c4SStefano Zampini m = A->rmap->n; 3070fcdce8c4SStefano Zampini n = B->rmap->n; 3071fcdce8c4SStefano Zampini k = A->cmap->n; 30729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3073fcdce8c4SStefano Zampini Amat = Acusp->mat; 3074fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 3075fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3076fcdce8c4SStefano Zampini break; 3077d71ae5a4SJacob Faibussowitsch default: 3078d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 3079fcdce8c4SStefano Zampini } 3080fcdce8c4SStefano Zampini 3081fcdce8c4SStefano Zampini /* create cusparse matrix */ 30829566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, m, n, m, n)); 30839566063dSJacob Faibussowitsch PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 3084fcdce8c4SStefano Zampini c = (Mat_SeqAIJ *)C->data; 3085fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 3086fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3087fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 3088fcdce8c4SStefano Zampini 3089fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 3090fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3091fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 30929566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 30939566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 3094fcdce8c4SStefano Zampini Ccusp->workVector = new 
THRUSTARRAY(c->compressedrow.nrows); 3095fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3096fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 3097fcdce8c4SStefano Zampini } else { 3098fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 3099fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 3100fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 3101fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 3102fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 3103fcdce8c4SStefano Zampini } 3104fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 3105fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 3106fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 3107fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 3108fcdce8c4SStefano Zampini Ccsr->num_cols = n; 3109fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 31109566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 31119566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 31129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 31139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 31149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 31159566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 31169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 31179566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 31189566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3119fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 3120d460d7bfSJunchao Zhang PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3121fcdce8c4SStefano Zampini c->nz = 0; 3122fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3123fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 3124fcdce8c4SStefano Zampini goto finalizesym; 3125fcdce8c4SStefano Zampini } 3126fcdce8c4SStefano Zampini 312728b400f6SJacob Faibussowitsch PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 312828b400f6SJacob Faibussowitsch PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3129fcdce8c4SStefano Zampini Acsr = (CsrMatrix *)Amat->mat; 3130fcdce8c4SStefano Zampini if (!biscompressed) { 3131fcdce8c4SStefano Zampini Bcsr = (CsrMatrix *)Bmat->mat; 3132fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3133fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 3134fcdce8c4SStefano Zampini #endif 3135fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 3136fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3137fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 3138fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 3139fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 3140fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 3141fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 3142fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 3143fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 
3144fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3145fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 31469566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3147fcdce8c4SStefano Zampini } 3148fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3149fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 3150fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3151fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 31529371c9d4SSatish Balay stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 31539371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3154fcdce8c4SStefano Zampini } 3155fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 3156fcdce8c4SStefano Zampini #endif 3157fcdce8c4SStefano Zampini } 315828b400f6SJacob Faibussowitsch PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 315928b400f6SJacob Faibussowitsch PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3160fcdce8c4SStefano Zampini /* precompute flops count */ 3161fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 3162fcdce8c4SStefano Zampini for (i = 0, flops = 0; i < A->rmap->n; i++) { 3163fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 3164fcdce8c4SStefano Zampini const PetscInt en = a->i[i + 1]; 3165fcdce8c4SStefano Zampini for (j = st; j < en; j++) { 3166fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 3167fcdce8c4SStefano Zampini flops += 2. 
* (b->i[brow + 1] - b->i[brow]); 3168fcdce8c4SStefano Zampini } 3169fcdce8c4SStefano Zampini } 3170fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 3171fcdce8c4SStefano Zampini for (i = 0, flops = 0; i < A->rmap->n; i++) { 3172fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i + 1] - a->i[i]; 3173fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3174fcdce8c4SStefano Zampini flops += (2. * anzi) * bnzi; 3175fcdce8c4SStefano Zampini } 3176fcdce8c4SStefano Zampini } else { /* TODO */ 3177fcdce8c4SStefano Zampini flops = 0.; 3178fcdce8c4SStefano Zampini } 3179fcdce8c4SStefano Zampini 3180fcdce8c4SStefano Zampini mmdata->flops = flops; 31819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3182b4285af6SJunchao Zhang 3183fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 31849566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3185*1ffab3bdSJunchao Zhang // cuda-12.2 requires non-null csrRowOffsets 3186*1ffab3bdSJunchao Zhang stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 31879371c9d4SSatish Balay PetscCallCUSPARSE(stat); 31889566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3189b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3190b4285af6SJunchao Zhang { 3191b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
3192b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3193b4285af6SJunchao Zhang */ 3194b4285af6SJunchao Zhang void *dBuffer1 = NULL; 3195b4285af6SJunchao Zhang void *dBuffer2 = NULL; 3196b4285af6SJunchao Zhang void *dBuffer3 = NULL; 3197b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3198b4285af6SJunchao Zhang size_t bufferSize1 = 0; 3199b4285af6SJunchao Zhang size_t bufferSize2 = 0; 3200b4285af6SJunchao Zhang size_t bufferSize3 = 0; 3201b4285af6SJunchao Zhang size_t bufferSize4 = 0; 3202b4285af6SJunchao Zhang size_t bufferSize5 = 0; 3203b4285af6SJunchao Zhang 3204b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 32059371c9d4SSatish Balay stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 32069371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32079566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3208b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 32099371c9d4SSatish Balay stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 32109371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3211b4285af6SJunchao Zhang 32129371c9d4SSatish Balay stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 32139371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 32159566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void 
**)&dBuffer3, bufferSize3)); 32169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 32179371c9d4SSatish Balay stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 32189371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32199566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer1)); 32209566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer2)); 3221b4285af6SJunchao Zhang 3222b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 32239566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3224b4285af6SJunchao Zhang c->nz = (PetscInt)C_nnz1; 3225b4285af6SJunchao Zhang /* allocate matrix C */ 32269371c9d4SSatish Balay Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 32279371c9d4SSatish Balay PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 32289371c9d4SSatish Balay Ccsr->values = new THRUSTARRAY(c->nz); 32299371c9d4SSatish Balay PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3230b4285af6SJunchao Zhang /* update matC with the new pointers */ 32319371c9d4SSatish Balay stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 32329371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3233b4285af6SJunchao Zhang 32349371c9d4SSatish Balay stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 32359371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32369566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 32379371c9d4SSatish Balay stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, 
BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 32389371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32399566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer3)); 32409371c9d4SSatish Balay stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 32419371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32429566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3243b4285af6SJunchao Zhang } 3244ae37ee31SJunchao Zhang #else 3245b4285af6SJunchao Zhang size_t bufSize2; 3246fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 32479371c9d4SSatish Balay stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 32489371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3250fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 32519371c9d4SSatish Balay stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 32529371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3253fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 32549371c9d4SSatish Balay stat = 
cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 32559371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3256fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 3257fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 3258fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3259fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3260fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... */ 32619566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3262fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 32639371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 32649371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3265fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 32669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3267fcdce8c4SStefano Zampini c->nz = (PetscInt)C_nnz1; 32689371c9d4SSatish Balay PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 32699371c9d4SSatish Balay mmdata->mmBufferSize / 1024)); 3270fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 32719566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3272fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 32739566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 32749371c9d4SSatish Balay stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 32759371c9d4SSatish Balay PetscCallCUSPARSE(stat); 32769371c9d4SSatish Balay stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 32779371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3278ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3279fcdce8c4SStefano Zampini #else 32809566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 32819371c9d4SSatish Balay stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 32829371c9d4SSatish Balay Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 32839371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3284fcdce8c4SStefano Zampini c->nz = cnz; 3285fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 32869566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3287fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 32889566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3289fcdce8c4SStefano Zampini 32909566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3291fcdce8c4SStefano 
Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3292fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3293fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 32949371c9d4SSatish Balay stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 32959371c9d4SSatish Balay Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 32969371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3297fcdce8c4SStefano Zampini #endif 32989566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 32999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3300fcdce8c4SStefano Zampini finalizesym: 3301fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 3302fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 3303fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 33049566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &c->i)); 33059566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->j)); 33067de69702SBarry Smith if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3307fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3308fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3309fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3310fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 3311fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 3312fcdce8c4SStefano Zampini if 
(ciscompressed) d_i = c->compressedrow.i; 33139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 33149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3315fcdce8c4SStefano Zampini } else { 3316fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3317fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 33189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 33199566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3320fcdce8c4SStefano Zampini } 3321fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 3322fcdce8c4SStefano Zampini PetscInt r = 0; 3323fcdce8c4SStefano Zampini c->i[0] = 0; 3324fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 3325fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 3326fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 3327fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r + 1] = old; 3328fcdce8c4SStefano Zampini } 3329fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3330fcdce8c4SStefano Zampini } 33319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 33329566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 33339566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->imax)); 3334fcdce8c4SStefano Zampini c->maxnz = c->nz; 3335fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 3336fcdce8c4SStefano Zampini c->rmax = 0; 3337fcdce8c4SStefano Zampini for (k = 0; 
k < m; k++) { 3338fcdce8c4SStefano Zampini const PetscInt nn = c->i[k + 1] - c->i[k]; 3339fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 3340fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt) !!nn; 3341fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 3342fcdce8c4SStefano Zampini } 33439566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(C)); 33449566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->a)); 3345fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 3346fcdce8c4SStefano Zampini 3347fcdce8c4SStefano Zampini C->nonzerostate++; 33489566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->rmap)); 33499566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->cmap)); 3350fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 3351fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3352fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 3353fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 3354fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 3355abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3356fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 3357fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 3358fcdce8c4SStefano Zampini } 3359fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 33603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3361fcdce8c4SStefano Zampini } 3362fcdce8c4SStefano Zampini 3363fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3364fcdce8c4SStefano Zampini 3365fcdce8c4SStefano Zampini /* handles sparse or dense B */ 3366d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3367d71ae5a4SJacob Faibussowitsch { 
/* Body of MatProductSetFromOptions_SeqAIJCUSPARSE (signature precedes this chunk).
   Chooses the productsymbolic implementation for a matrix product involving a
   SeqAIJCUSPARSE matrix: GPU dense path, GPU sparse-sparse path, or the CPU
   SeqAIJ fallback, depending on operand types, bind-to-CPU flags, and
   user-selected "*_backend_cpu" options. */
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  /* only consider the GPU backend when neither operand is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* each product type exposes two option names: the legacy API name (api_user) and the MatProduct name */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user forced the CPU backend */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* composite products fall back to the basic (pairwise) implementation */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A*xx; thin wrapper over MatMultAddKernel_SeqAIJCUSPARSE (body continues on the next chunk line) */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{ /* MatMult_SeqAIJCUSPARSE body: yy = A*xx (signature on the preceding line) */
  PetscFunctionBegin;
  /* trans = PETSC_FALSE, herm = PETSC_FALSE: plain y = A x */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A*xx + yy */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^H*xx (conjugate transpose: trans and herm both set) */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^H*xx + yy */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* yy = A^T*xx (real transpose; body continues on the next chunk line) */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
/* tail of MatMultTranspose_SeqAIJCUSPARSE */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* CUDA kernel: y[idx[i]] += x[i] for i in [0,n).
   One thread per entry, bounds-checked; launched by the caller below with
   256-thread blocks on PetscDefaultCudaStream. Used to scatter-add the
   compressed-row work vector back into the full-length result vector. */
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}

/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op
   yy may be NULL (no add); yy == zz is allowed (in-place add).
   Handles both the compressed-row (zero rows dropped) and full-row layouts. */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: result is just yy (or zero) */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* either let cuSPARSE apply op = ^T/^H on the fly, or use a cached explicit transpose */
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand: create dense-vector descriptors and size/allocate the SpMV buffer */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {           /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                                thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                                VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } /* transpose branch continues on the next chunk line */
else { /* transpose case: A^T x is already full length; only the yy add remains */
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without the add, zeroed rows save one flop each */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* zz = A^T*xx + yy; thin wrapper over the common kernel */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Runs the host SeqAIJ assembly, then drops the cached device matrix if the
   nonzero state changed during assembly (body continues on the next chunk line) */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscObjectState    onnz = A->nonzerostate;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  if (onnz != A->nonzerostate &&
cusp->deviceMat) { /* nonzero pattern changed: cached device matrix is stale */
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format). This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
  assembly performance the user should preallocate the matrix storage by setting
  the parameter `nz` (or the array `nnz`).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  /* create a sequential (m x n) matrix of the CUSPARSE type and preallocate it */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Frees the GPU-side data (matrix or triangular factors), removes the composed
   methods, then runs the host SeqAIJ destroy (body continues on the next chunk line) */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    /* factored matrices store triangular-factor data in spptr instead */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* unregister all methods composed onto this object by the CUSPARSE implementation */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
/* forward declaration (signature continues on the next chunk line) */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,
PetscBool);

/* Duplicates via the host SeqAIJ path, then converts the copy in place back to CUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Y = Y + a*X. Uses cublasXaxpy when the nonzero patterns match, cusparse spgeam for
   SUBSET_NONZERO_PATTERN, and the host SeqAIJ implementation otherwise
   (body continues on the next chunk lines) */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* mismatched implementations (e.g. one matrix bound to CPU): fall back to host AXPY */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR
supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* compare the two CSR structures on the device; identical pattern enables the axpy path */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via cusparse spgeam, writing the result back into Y's arrays */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* alpha/beta (&a, &b) are host pointers here, so switch the pointer mode temporarily */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 38599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3860039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3861a587d139SMark cublasHandle_t cublasv2handle; 3862a587d139SMark PetscBLASInt one = 1, bnz = 1; 3863039c6fbaSStefano Zampini 38649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 38659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 38669566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 38679566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz, &bnz)); 38689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 38699566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 38709566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * bnz)); 38719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 38729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 38739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 38749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3875039c6fbaSStefano Zampini } else { 38769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 38779566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3878a587d139SMark } 38793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 388095639643SRichard Tran Mills } 388195639643SRichard Tran Mills 3882d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3883d71ae5a4SJacob Faibussowitsch { 388433c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 388533c9ba73SStefano Zampini PetscScalar *ay; 388633c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 388733c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 388833c9ba73SStefano Zampini 
388933c9ba73SStefano Zampini PetscFunctionBegin; 38909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 38919566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 38929566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz, &bnz)); 38939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 38949566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 38959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(bnz)); 38969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 38979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 38989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 38993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 390033c9ba73SStefano Zampini } 390133c9ba73SStefano Zampini 3902d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3903d71ae5a4SJacob Faibussowitsch { 39047e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3905a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 39067e8381f9SStefano Zampini 39073fa6b06aSMark Adams PetscFunctionBegin; 39083fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 39093fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 39107e8381f9SStefano Zampini if (spptr->mat) { 39117e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 39127e8381f9SStefano Zampini if (matrix->values) { 39137e8381f9SStefano Zampini both = PETSC_TRUE; 39147e8381f9SStefano Zampini thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 39157e8381f9SStefano Zampini } 39167e8381f9SStefano Zampini } 39177e8381f9SStefano Zampini if (spptr->matTranspose) { 39187e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3919ad540459SPierre Jolivet if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), 
matrix->values->end(), 0.); 39207e8381f9SStefano Zampini } 39213fa6b06aSMark Adams } 39229566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 39239566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 39247e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3925a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 39263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 39273fa6b06aSMark Adams } 39283fa6b06aSMark Adams 3929d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 3930d71ae5a4SJacob Faibussowitsch { 3931a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3932a587d139SMark 3933a587d139SMark PetscFunctionBegin; 39349a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 39359a14fc28SStefano Zampini A->boundtocpu = flg; 39363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 39379a14fc28SStefano Zampini } 3938a587d139SMark if (flg) { 39399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3940a587d139SMark 394133c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3942a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3943a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3944a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3945a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3946a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3947a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3948a587d139SMark A->ops->multhermitiantranspose = NULL; 3949a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3950fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 39519566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 39529566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 39539566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 39549566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 39559566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 39569566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 39579566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3958a587d139SMark } else { 395933c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3960a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3961a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3962a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3963a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3964a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3965a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3966a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3967a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3968fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 396967a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 397067a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 397167a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 397267a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 397367a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 397467a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 
39757ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 39767ee59b9bSJunchao Zhang 39779566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 39789566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 39799566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 39809566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 39819566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 39829566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 3983a587d139SMark } 3984a587d139SMark A->boundtocpu = flg; 3985ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 3986ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 3987ea500dcfSRichard Tran Mills } else { 3988ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 3989ea500dcfSRichard Tran Mills } 39903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3991a587d139SMark } 3992a587d139SMark 39938eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 3994d71ae5a4SJacob Faibussowitsch { 399549735bf3SStefano Zampini Mat B; 39969ae82921SPaul Mullowney 39979ae82921SPaul Mullowney PetscFunctionBegin; 39989566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 399949735bf3SStefano 
Zampini if (reuse == MAT_INITIAL_MATRIX) { 40009566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 400149735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 40029566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 400349735bf3SStefano Zampini } 400449735bf3SStefano Zampini B = *newmat; 400549735bf3SStefano Zampini 40069566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 40079566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 400834136279SStefano Zampini 400949735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 40109ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 4011e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 40129566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 40139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 40149566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 40151a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 4016d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4017b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4018a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4019a435da06SStefano Zampini #else 4020d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4021a435da06SStefano Zampini #endif 4022d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4023d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4024d8132acaSStefano Zampini #endif 40251a2c6b5cSJunchao Zhang B->spptr = spptr; 40269ae82921SPaul Mullowney } else { 4027e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 4028e6e9a74fSStefano Zampini 40299566063dSJacob Faibussowitsch 
PetscCall(PetscNew(&spptr)); 40309566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 40319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4032e6e9a74fSStefano Zampini B->spptr = spptr; 40339ae82921SPaul Mullowney } 4034e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 403549735bf3SStefano Zampini } 4036693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 40379ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 40381a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 40399ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 404095639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4041693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 40422205254eSKarl Rupp 40439566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 40449566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 40459566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4046ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 40479566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 4048ae48a8d0SStefano Zampini #endif 40499566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 40503ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 40519ae82921SPaul Mullowney } 40529ae82921SPaul Mullowney 4053d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 4054d71ae5a4SJacob Faibussowitsch { 405502fe1965SBarry Smith PetscFunctionBegin; 40569566063dSJacob Faibussowitsch 
PetscCall(MatCreate_SeqAIJ(B)); 40579566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 40583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 405902fe1965SBarry Smith } 406002fe1965SBarry Smith 40613ca39a21SBarry Smith /*MC 4062e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 4063e057df02SPaul Mullowney 406411a5261eSBarry Smith A matrix type type whose data resides on NVIDIA GPUs. These matrices can be in either 406511a5261eSBarry Smith CSR, ELL, or Hybrid format. 406611a5261eSBarry Smith All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 4067e057df02SPaul Mullowney 4068e057df02SPaul Mullowney Options Database Keys: 406911a5261eSBarry Smith + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()` 40702ef1f0ffSBarry Smith . -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`). 40712ef1f0ffSBarry Smith Other options include ell (ellpack) or hyb (hybrid). 40722ef1f0ffSBarry Smith . -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid). 
40732ef1f0ffSBarry Smith - -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU 4074e057df02SPaul Mullowney 4075e057df02SPaul Mullowney Level: beginner 4076e057df02SPaul Mullowney 40771cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 4078e057df02SPaul Mullowney M*/ 40797f756511SDominic Meiser 4080bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *); 40810f39cd5aSBarry Smith 4082d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 4083d71ae5a4SJacob Faibussowitsch { 408442c9c57cSBarry Smith PetscFunctionBegin; 40859566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band)); 40869566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 40879566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 40889566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 40899566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 4090bddcd29dSMark Adams 40913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 409242c9c57cSBarry Smith } 409329b38603SBarry Smith 4094d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) 4095d71ae5a4SJacob Faibussowitsch { 4096cbc6b225SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr; 
4097cbc6b225SStefano Zampini 4098cbc6b225SStefano Zampini PetscFunctionBegin; 40993ba16761SJacob Faibussowitsch if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); 4100cbc6b225SStefano Zampini delete cusp->cooPerm; 4101cbc6b225SStefano Zampini delete cusp->cooPerm_a; 4102cbc6b225SStefano Zampini cusp->cooPerm = NULL; 4103cbc6b225SStefano Zampini cusp->cooPerm_a = NULL; 4104cbc6b225SStefano Zampini if (cusp->use_extended_coo) { 41059566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->jmap_d)); 41069566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->perm_d)); 4107cbc6b225SStefano Zampini } 4108cbc6b225SStefano Zampini cusp->use_extended_coo = PETSC_FALSE; 41093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4110cbc6b225SStefano Zampini } 4111cbc6b225SStefano Zampini 4112d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 4113d71ae5a4SJacob Faibussowitsch { 41147f756511SDominic Meiser PetscFunctionBegin; 41157f756511SDominic Meiser if (*cusparsestruct) { 41169566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format)); 41179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format)); 41187f756511SDominic Meiser delete (*cusparsestruct)->workVector; 411981902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 41207e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 41217e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 4122a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 41239566063dSJacob Faibussowitsch if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 41249566063dSJacob Faibussowitsch if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 41259566063dSJacob Faibussowitsch if ((*cusparsestruct)->perm_d) 
PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 41269566063dSJacob Faibussowitsch PetscCall(PetscFree(*cusparsestruct)); 41277f756511SDominic Meiser } 41283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41297f756511SDominic Meiser } 41307f756511SDominic Meiser 4131d71ae5a4SJacob Faibussowitsch static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 4132d71ae5a4SJacob Faibussowitsch { 41337f756511SDominic Meiser PetscFunctionBegin; 41347f756511SDominic Meiser if (*mat) { 41357f756511SDominic Meiser delete (*mat)->values; 41367f756511SDominic Meiser delete (*mat)->column_indices; 41377f756511SDominic Meiser delete (*mat)->row_offsets; 41387f756511SDominic Meiser delete *mat; 41397f756511SDominic Meiser *mat = 0; 41407f756511SDominic Meiser } 41413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41427f756511SDominic Meiser } 41437f756511SDominic Meiser 4144b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4145d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 4146d71ae5a4SJacob Faibussowitsch { 41477f756511SDominic Meiser PetscFunctionBegin; 41487f756511SDominic Meiser if (*trifactor) { 41499566063dSJacob Faibussowitsch if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 4150261a78b4SJunchao Zhang if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 41519566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 41529566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 41539566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 4154afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 41559566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Free a mat/matTranspose multiply struct: the stored matrix (CSR or, pre CUDA-11,
   HYB/ELL), its descriptors, the device-resident scalar constants, and the cached
   SpMV descriptors/buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* the scalar constants 1 and 0 live in device memory */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Release all factorization state held in the triangular-factors container without
   freeing the container itself (it can be refactored and reused afterwards). */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
fs->workVector = NULL; 4218d460d7bfSJunchao Zhang #endif 4219da112707SJunchao Zhang delete fs->rpermIndices; 4220da112707SJunchao Zhang delete fs->cpermIndices; 4221da112707SJunchao Zhang fs->rpermIndices = NULL; 4222da112707SJunchao Zhang fs->cpermIndices = NULL; 4223da112707SJunchao Zhang if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d)); 4224da112707SJunchao Zhang if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d)); 4225da112707SJunchao Zhang fs->init_dev_prop = PETSC_FALSE; 4226b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4227da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4228da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx)); 422930807b38SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr32)); 423030807b38SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx32)); 4231da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrVal)); 4232d460d7bfSJunchao Zhang PetscCallCUDA(cudaFree(fs->diag)); 4233da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->X)); 4234da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->Y)); 423512ba2bc6SJunchao Zhang // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 4236da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4237da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 423812ba2bc6SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4239da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4240da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4241da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4242da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4243da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4244da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4245da112707SJunchao Zhang 
PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4246da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4247da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4248da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4249da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4250da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4251d460d7bfSJunchao Zhang PetscCall(PetscFree(fs->csrRowPtr_h)); 4252d460d7bfSJunchao Zhang PetscCall(PetscFree(fs->csrVal_h)); 4253d460d7bfSJunchao Zhang PetscCall(PetscFree(fs->diag_h)); 425412ba2bc6SJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_FALSE; 425512ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4256da112707SJunchao Zhang #endif 4257ccdfe979SStefano Zampini } 42583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4259ccdfe979SStefano Zampini } 4260ccdfe979SStefano Zampini 4261d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) 4262d71ae5a4SJacob Faibussowitsch { 4263ccdfe979SStefano Zampini PetscFunctionBegin; 4264ccdfe979SStefano Zampini if (*trifactors) { 42659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 4266f0173cd6SStefano Zampini PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle)); 42679566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 42687f756511SDominic Meiser } 42693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 42707f756511SDominic Meiser } 42717e8381f9SStefano Zampini 42729371c9d4SSatish Balay struct IJCompare { 4273d71ae5a4SJacob Faibussowitsch __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 4274d71ae5a4SJacob Faibussowitsch { 42757e8381f9SStefano Zampini if 
(t1.get<0>() < t2.get<0>()) return true;
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};

/* Functor: true iff two (row,col) tuples are identical; used to collapse repeated COO entries */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
    return true;
  }
};

/* Functor: 0 if the two adjacent values are equal, 1 otherwise; fed to thrust::adjacent_difference
   to flag positions where a new (i,j) pair starts */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
};

/* Functor: logical OR of two 0/1 flags, returned as PetscInt; combines the row-change and col-change flags */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
};

#include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   Insert/add the COO values v[] (host or device memory) into the device CSR value array,
   using the permutation cusp->cooPerm (and, when duplicates exist, the reduction map
   cusp->cooPerm_a) built during preallocation.

   Input:
     A     - the MATSEQAIJCUSPARSE matrix, preallocated with the Basic COO path
     v     - the COO values, one per original COO entry (may be NULL: with INSERT_VALUES
             this zeros the matrix, otherwise it is a no-op on the values)
     imode - INSERT_VALUES or ADD_VALUES
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                            *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL; /* temporary device copy of v[] when v is host memory */
  thrust::device_ptr<const PetscScalar>  d_v;
  CsrMatrix                             *matrix;
  PetscInt                               n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation data: fall back to a plain assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* v is host memory: stage it on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) {   /* ADD_VALUES means add to existing ones */
    if (cusp->cooPerm_a) {     /* there are repeated entries in d_v[], and we need to add them together first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce directly into the CSR values */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals()); /* values[i] = d_v[cooPerm[i]] */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v; /* safe when NULL (device-memory v path) */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Mark the cached transpose of A as stale; with destroy == PETSC_TRUE also free the
   cached transpose multiply structure and the csr2csc index map.
   A no-op when the matrix has no CUSPARSE data (spptr is NULL). */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      /* release the device transpose and the cached csr->csc permutation */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices.

   Builds the CSR structure of A on the host from n COO (i,j) pairs (host or device memory),
   recording in cusp->cooPerm the permutation that sorts the COO entries by (row,col) and,
   when duplicates exist, in cusp->cooPerm_a the map from sorted COO entry to unique nonzero. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr: number of nonempty rows */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* size changed: drop any previously cached permutations */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* Stage coo_i[] on the device if it was given in host memory */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    /* Same for coo_j[]; the two memtypes may differ */
    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row end offsets of the unique entries */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
       n = 6
       coo_i = [3,3,1,4,1,4]
       coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                             /* copy the sorted row array */
    THRUSTINTARRAY w(d_j, d_j + n);                                      /* copy of the sorted col array */

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i = [1,3,3,4,4,x]
                        ^ekey
      d_j = [2,2,3,5,6,x]
                       ^nekey
    */
    if (nekey == ekey) { /* all entries are unique: no reduction map is needed */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                             /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());          /* cooPerm_a = [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* Rebuild the host CSR arrays of the Mat_SeqAIJ from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Dispatch between the 'Basic' COO path (no negative indices, permutation kept on device)
   and the extended SeqAIJ COO path (handles negative = ignored entries), mirroring the
   host-built jmap/perm arrays onto the device in the latter case. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE; /* default when coo_i is NULL */

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) { /* indices are only scanned for negatives when on host */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* copy the host-side jmap (nonzero -> range of COO entries) and perm (COO entry map) to the device */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Kernel: fold the COO values kv[] into the CSR value array a[].
   Grid-stride loop: each logical index i is one CSR nonzero; the COO entries contributing
   to it are kv[perm[k]] for k in [jmap[i], jmap[i+1]).
   With INSERT_VALUES the previous a[i] is discarded, otherwise the sum is accumulated. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar contrib = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) contrib += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + contrib;
  }
}
/* Set/add COO values for a MATSEQAIJCUSPARSE matrix.
   Uses the extended COO machinery (device jmap/perm + MatAddCOOValues kernel) when it was
   set up by MatSetPreallocationCOO_SeqAIJCUSPARSE(); otherwise falls back to the Basic path. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v; /* device view of v[]; may be a temporary copy */
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* write access does not need the current values; read-write does */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* free the temporary device copy */
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers
- j - the CSR column indices

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  Either `i` or `j` may be `NULL` when only the other array is needed

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Fix: was `if (!i || !j) return`, which silently returned nothing when the caller
     requested only one of the two arrays and made the `if (i)`/`if (j)` guards below dead
     code. Each requested (non-NULL) output is now filled independently. */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {             /* build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* accepted for symmetry with the Get call; nothing to undo */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure device values are current */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no offload mask change or object-state increase is needed */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write: the current values are needed */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csrmat->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes the authoritative one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* invalidate the caller's pointer; the matrix keeps the data */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));            /* values may have changed */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* note: unlike MatSeqAIJCUSPARSEGetArray()/GetArrayRead(), no MatSeqAIJCUSPARSECopyToGPU()
     here — write-only access does not need the current values */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller is expected to overwrite the device values */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
from `MatSeqAIJCUSPARSEGetArrayWrite()` 48485b7e41feSStefano Zampini 48495b7e41feSStefano Zampini Not Collective 48505b7e41feSStefano Zampini 48512ef1f0ffSBarry Smith Input Parameters: 48522ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix 48532ef1f0ffSBarry Smith - a - pointer to the device data 48545b7e41feSStefano Zampini 48555b7e41feSStefano Zampini Level: developer 48565b7e41feSStefano Zampini 48571cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()` 48585b7e41feSStefano Zampini @*/ 4859d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) 4860d71ae5a4SJacob Faibussowitsch { 4861ed502f03SStefano Zampini PetscFunctionBegin; 4862ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4863ed502f03SStefano Zampini PetscValidPointer(a, 2); 4864ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 48659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 48669566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4867ed502f03SStefano Zampini *a = NULL; 48683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4869ed502f03SStefano Zampini } 4870ed502f03SStefano Zampini 48719371c9d4SSatish Balay struct IJCompare4 { 4872d71ae5a4SJacob Faibussowitsch __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) 4873d71ae5a4SJacob Faibussowitsch { 4874ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4875ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4876ed502f03SStefano Zampini return false; 4877ed502f03SStefano Zampini } 4878ed502f03SStefano Zampini }; 4879ed502f03SStefano Zampini 48809371c9d4SSatish Balay struct Shift { 4881ed502f03SStefano Zampini int _shift; 4882ed502f03SStefano Zampini 4883ed502f03SStefano Zampini Shift(int shift) : 
_shift(shift) { } 48849371c9d4SSatish Balay __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4885ed502f03SStefano Zampini }; 4886ed502f03SStefano Zampini 4887ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 4888d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4889d71ae5a4SJacob Faibussowitsch { 4890ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4891ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4892ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4893ed502f03SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 4894ed502f03SStefano Zampini PetscInt Annz, Bnnz; 4895ed502f03SStefano Zampini cusparseStatus_t stat; 4896ed502f03SStefano Zampini PetscInt i, m, n, zero = 0; 4897ed502f03SStefano Zampini 4898ed502f03SStefano Zampini PetscFunctionBegin; 4899ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4900ed502f03SStefano Zampini PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4901ed502f03SStefano Zampini PetscValidPointer(C, 4); 4902ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4903ed502f03SStefano Zampini PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 49045f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 490508401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4906aed4548fSBarry Smith PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4907aed4548fSBarry Smith PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != 
MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4908ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4909ed502f03SStefano Zampini m = A->rmap->n; 4910ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 49119566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, C)); 49129566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C, m, n, m, n)); 49139566063dSJacob Faibussowitsch PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4914ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4915ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4916ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4917ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4918ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4919ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4920ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4921ed502f03SStefano Zampini c->compressedrow.i = NULL; 4922ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4923ed502f03SStefano Zampini Ccusp->workVector = NULL; 4924ed502f03SStefano Zampini Ccusp->nrows = m; 4925ed502f03SStefano Zampini Ccusp->mat = Cmat; 4926ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4927ed502f03SStefano Zampini Ccsr->num_rows = m; 4928ed502f03SStefano Zampini Ccsr->num_cols = n; 49299566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 49309566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 49319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 49329566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 49339566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 49349566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 49359566063dSJacob 
Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 49369566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 49379566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 49389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 49399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 494028b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 494128b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4942ed502f03SStefano Zampini 4943ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4944ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4945ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4946ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4947ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4948ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4949ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4950ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4951ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4952ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4953ed502f03SStefano Zampini if (c->nz) { 49542ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 49552ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 49562ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 49572ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff, *Broff; 49582ed87e7eSStefano Zampini 4959ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4960ed502f03SStefano Zampini 
if (!Acusp->rowoffsets_gpu) { 4961ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4962ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 49639566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4964ed502f03SStefano Zampini } 49652ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 49662ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4967ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4968ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4969ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4970ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 49719566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4972ed502f03SStefano Zampini } 49732ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 49742ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 49759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 49769371c9d4SSatish Balay stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 49779371c9d4SSatish Balay PetscCallCUSPARSE(stat); 49789371c9d4SSatish Balay stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 49799371c9d4SSatish Balay PetscCallCUSPARSE(stat); 49802ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 49812ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 49822ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 49838909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4984ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4985ed502f03SStefano Zampini auto Bcie = 
thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 49868909a122SStefano Zampini #else 49878909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 49888909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 49898909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 49908909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 49918909a122SStefano Zampini #endif 49922ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 49932ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 49942ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 49952ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 49962ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 49972ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4998ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 4999ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 5000ed502f03SStefano Zampini thrust::advance(p2, Annz); 5001792fecdfSBarry Smith PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 50028909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 50038909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 50048909a122SStefano Zampini #endif 50052ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 50062ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 
50072ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 5008792fecdfSBarry Smith PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 50092ed87e7eSStefano Zampini #else 50102ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 5011792fecdfSBarry Smith PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 5012792fecdfSBarry Smith PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 50132ed87e7eSStefano Zampini #endif 50149371c9d4SSatish Balay stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 50159371c9d4SSatish Balay PetscCallCUSPARSE(stat); 50169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 50172ed87e7eSStefano Zampini delete wPerm; 50182ed87e7eSStefano Zampini delete Acoo; 50192ed87e7eSStefano Zampini delete Bcoo; 50202ed87e7eSStefano Zampini delete Ccoo; 5021ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 50229371c9d4SSatish Balay stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 50239371c9d4SSatish Balay PetscCallCUSPARSE(stat); 5024ed502f03SStefano Zampini #endif 50251a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 50269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 50279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 5028ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? 
PETSC_TRUE : PETSC_FALSE; 5029ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 5030ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 5031ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 5032ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 5033ed502f03SStefano Zampini 50341a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 50351a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 5036a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 5037ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 5038ed502f03SStefano Zampini CmatT->mat = CcsrT; 5039ed502f03SStefano Zampini CcsrT->num_rows = n; 5040ed502f03SStefano Zampini CcsrT->num_cols = m; 5041ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 5042ed502f03SStefano Zampini 5043ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 5044ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 5045ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 5046ed502f03SStefano Zampini 50479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 5048ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 5049ed502f03SStefano Zampini if (AT) { 5050ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 5051ed502f03SStefano Zampini thrust::advance(rT, -1); 5052ed502f03SStefano Zampini } 5053ed502f03SStefano Zampini if (BT) { 5054ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 5055ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 5056ed502f03SStefano Zampini thrust::copy(titb, tite, rT); 5057ed502f03SStefano Zampini } 5058ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 5059ed502f03SStefano Zampini if (AT) cT 
= thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 5060ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 5061ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 5062ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 5063ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 50649566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5065ed502f03SStefano Zampini 50669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 50679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 50689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 50699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar))); 50709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar))); 50719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 50729566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 50739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 50749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 5075ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 50769371c9d4SSatish Balay stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 
CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 50779371c9d4SSatish Balay PetscCallCUSPARSE(stat); 5078ed502f03SStefano Zampini #endif 5079ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 5080ed502f03SStefano Zampini } 5081ed502f03SStefano Zampini } 5082ed502f03SStefano Zampini 5083ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 5084ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 5085ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 50869566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &c->i)); 50879566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->j)); 50887de69702SBarry Smith if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 5089ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 5090ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 5091ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 5092ed502f03SStefano Zampini jj = *Ccsr->column_indices; 50939566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 50949566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 5095ed502f03SStefano Zampini } else { 50969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 50979566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 5098ed502f03SStefano Zampini } 50999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 51009566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 51019566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, 
&c->imax)); 5102ed502f03SStefano Zampini c->maxnz = c->nz; 5103ed502f03SStefano Zampini c->nonzerorowcnt = 0; 5104ed502f03SStefano Zampini c->rmax = 0; 5105ed502f03SStefano Zampini for (i = 0; i < m; i++) { 5106ed502f03SStefano Zampini const PetscInt nn = c->i[i + 1] - c->i[i]; 5107ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 5108ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt) !!nn; 5109ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 5110ed502f03SStefano Zampini } 51119566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 51129566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->a)); 5113ed502f03SStefano Zampini (*C)->nonzerostate++; 51149566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 51159566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 5116ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 5117ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 5118ed502f03SStefano Zampini } else { 511908401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 5120ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 5121ed502f03SStefano Zampini if (c->nz) { 5122ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 51235f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm"); 5124aed4548fSBarry Smith PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 512508401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 51269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 51279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 51285f80ce2aSJacob Faibussowitsch 
PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 51295f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 5130ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 5131ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 5132ed502f03SStefano Zampini Ccsr = (CsrMatrix *)Ccusp->mat->mat; 5133aed4548fSBarry Smith PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 5134aed4548fSBarry Smith PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 5135aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 5136aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 51375f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 5138ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 5139ed502f03SStefano Zampini thrust::advance(pmid, Acsr->num_entries); 51409566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 51419371c9d4SSatish Balay auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 51429371c9d4SSatish Balay auto 
zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 5143ed502f03SStefano Zampini thrust::for_each(zibait, zieait, VecCUDAEquals()); 51449371c9d4SSatish Balay auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 51459371c9d4SSatish Balay auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 5146ed502f03SStefano Zampini thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 51479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 51481a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 51495f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 5150ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5151ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 5152ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix *)Bcusp->matTranspose->mat : NULL; 5153ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 5154ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 5155ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 5156ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 51571a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 5158ed502f03SStefano Zampini } 51599566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5160ed502f03SStefano Zampini } 5161ed502f03SStefano Zampini } 51629566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 5163ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 5164ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 5165ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 51663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5167ed502f03SStefano Zampini } 5168c215019aSStefano Zampini 5169d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) 5170d71ae5a4SJacob Faibussowitsch { 5171c215019aSStefano Zampini bool dmem; 5172c215019aSStefano Zampini const PetscScalar *av; 5173c215019aSStefano Zampini 5174c215019aSStefano Zampini PetscFunctionBegin; 5175c215019aSStefano Zampini dmem = isCudaMem(v); 51769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av)); 5177c215019aSStefano Zampini if (n && idx) { 5178c215019aSStefano Zampini THRUSTINTARRAY widx(n); 5179c215019aSStefano Zampini widx.assign(idx, idx + n); 51809566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 5181c215019aSStefano Zampini 5182c215019aSStefano Zampini THRUSTARRAY *w = NULL; 5183c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 5184c215019aSStefano Zampini if (dmem) { 5185c215019aSStefano Zampini dv = 
thrust::device_pointer_cast(v); 5186c215019aSStefano Zampini } else { 5187c215019aSStefano Zampini w = new THRUSTARRAY(n); 5188c215019aSStefano Zampini dv = w->data(); 5189c215019aSStefano Zampini } 5190c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 5191c215019aSStefano Zampini 5192c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv)); 5193c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n)); 5194c215019aSStefano Zampini thrust::for_each(zibit, zieit, VecCUDAEquals()); 519548a46eb9SPierre Jolivet if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 5196c215019aSStefano Zampini delete w; 5197c215019aSStefano Zampini } else { 51989566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 5199c215019aSStefano Zampini } 52009566063dSJacob Faibussowitsch if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 52019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av)); 52023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5203c215019aSStefano Zampini } 5204