19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 79ae82921SPaul Mullowney 83d13b8fdSMatthew G. Knepley #include <petscconf.h> 93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 12af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 139ae82921SPaul Mullowney #undef VecType 143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 15a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h> 17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 18a2cee5feSJed Brown #include <thrust/remove.h> 19a2cee5feSJed Brown #include <thrust/sort.h> 20a2cee5feSJed Brown #include <thrust/unique.h> 21e8d2b73aSMark Adams 22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 24afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 25afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 26afb2bd1cSJunchao Zhang 27afb2bd1cSJunchao Zhang typedef enum { 28afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 29afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 30afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 31afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 32afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 33afb2bd1cSJunchao Zhang 34afb2bd1cSJunchao Zhang typedef enum { 35afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 42afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 43afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 44afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 45afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 46afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 47afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 48afb2bd1cSJunchao Zhang 49afb2bd1cSJunchao Zhang typedef enum { 50afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 51afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 52afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 53afb2bd1cSJunchao Zhang */ 54afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 55afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 56afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 57afb2bd1cSJunchao Zhang #endif 589ae82921SPaul Mullowney 59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 62087f3262SPaul Mullowney 636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 66087f3262SPaul Mullowney 676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 71dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject); 72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 819ae82921SPaul Mullowney 827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **); 86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **); 877f756511SDominic Meiser 8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool); 9057181aedSStefano Zampini 91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]); 92e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]); 93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode); 94c215019aSStefano Zampini 959371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) { 96aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 976e111a19SKarl Rupp 98ca45077fSPaul Mullowney PetscFunctionBegin; 99ca45077fSPaul Mullowney switch (op) { 1009371c9d4SSatish Balay case MAT_CUSPARSE_MULT: cusparsestruct->format = format; break; 1019371c9d4SSatish Balay case MAT_CUSPARSE_ALL: cusparsestruct->format = format; break; 1029371c9d4SSatish Balay default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op); 103ca45077fSPaul Mullowney } 104ca45077fSPaul Mullowney PetscFunctionReturn(0); 105ca45077fSPaul Mullowney } 1069ae82921SPaul Mullowney 107e057df02SPaul Mullowney /*@ 108e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 109e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 110aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 111e057df02SPaul Mullowney Not Collective 112e057df02SPaul Mullowney 113e057df02SPaul Mullowney Input Parameters: 1148468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 11536d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1162692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 117e057df02SPaul Mullowney 118e057df02SPaul Mullowney Output Parameter: 119e057df02SPaul Mullowney 120e057df02SPaul Mullowney Level: intermediate 121e057df02SPaul Mullowney 122db781477SPatrick Sanan .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 123e057df02SPaul Mullowney @*/ 1249371c9d4SSatish Balay PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) { 125e057df02SPaul Mullowney PetscFunctionBegin; 126e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 127cac4c232SBarry Smith PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format)); 128e057df02SPaul Mullowney PetscFunctionReturn(0); 129e057df02SPaul Mullowney } 130e057df02SPaul Mullowney 1319371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) { 132365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 133365b711fSMark Adams 134365b711fSMark Adams PetscFunctionBegin; 135365b711fSMark Adams cusparsestruct->use_cpu_solve = use_cpu; 136365b711fSMark Adams PetscFunctionReturn(0); 137365b711fSMark Adams } 138365b711fSMark Adams 139365b711fSMark Adams /*@ 140365b711fSMark Adams MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve. 141365b711fSMark Adams 142365b711fSMark Adams Input Parameters: 143365b711fSMark Adams + A - Matrix of type SEQAIJCUSPARSE 144365b711fSMark Adams - use_cpu - set flag for using the built-in CPU MatSolve 145365b711fSMark Adams 146365b711fSMark Adams Output Parameter: 147365b711fSMark Adams 148365b711fSMark Adams Notes: 149365b711fSMark Adams The cuSparse LU solver currently computes the factors with the built-in CPU method 150365b711fSMark Adams and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there. 151365b711fSMark Adams This method to specify if the solve is done on the CPU or GPU (GPU is the default). 152365b711fSMark Adams 153365b711fSMark Adams Level: intermediate 154365b711fSMark Adams 155db781477SPatrick Sanan .seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 156365b711fSMark Adams @*/ 1579371c9d4SSatish Balay PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) { 158365b711fSMark Adams PetscFunctionBegin; 159365b711fSMark Adams PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 160cac4c232SBarry Smith PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu)); 161365b711fSMark Adams PetscFunctionReturn(0); 162365b711fSMark Adams } 163365b711fSMark Adams 1649371c9d4SSatish Balay PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) { 165e6e9a74fSStefano Zampini PetscFunctionBegin; 1661a2c6b5cSJunchao Zhang switch (op) { 1671a2c6b5cSJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 1681a2c6b5cSJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 1699566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1701a2c6b5cSJunchao Zhang A->form_explicit_transpose = flg; 1711a2c6b5cSJunchao Zhang break; 1729371c9d4SSatish Balay default: PetscCall(MatSetOption_SeqAIJ(A, op, flg)); break; 173e6e9a74fSStefano Zampini } 174e6e9a74fSStefano Zampini PetscFunctionReturn(0); 175e6e9a74fSStefano Zampini } 176e6e9a74fSStefano Zampini 177bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A); 178bddcd29dSMark Adams 1799371c9d4SSatish Balay static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) { 180bddcd29dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 181bddcd29dSMark Adams IS isrow = b->row, iscol = b->col; 182bddcd29dSMark Adams PetscBool row_identity, col_identity; 183365b711fSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr; 184bddcd29dSMark Adams 185bddcd29dSMark Adams PetscFunctionBegin; 1869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1879566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 188bddcd29dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 189bddcd29dSMark Adams /* determine which version of MatSolve needs to be used. */ 1909566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow, &row_identity)); 1919566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol, &col_identity)); 192f93f8571SJunchao Zhang 193365b711fSMark Adams if (!cusparsestruct->use_cpu_solve) { 194f93f8571SJunchao Zhang if (row_identity && col_identity) { 195bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 196bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 197bddcd29dSMark Adams } else { 198bddcd29dSMark Adams B->ops->solve = MatSolve_SeqAIJCUSPARSE; 199bddcd29dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 200365b711fSMark Adams } 201f93f8571SJunchao Zhang } 202bddcd29dSMark Adams B->ops->matsolve = NULL; 203bddcd29dSMark Adams B->ops->matsolvetranspose = NULL; 204bddcd29dSMark Adams 205bddcd29dSMark Adams /* get the triangular factors */ 206*48a46eb9SPierre Jolivet if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 207bddcd29dSMark Adams PetscFunctionReturn(0); 208bddcd29dSMark Adams } 209bddcd29dSMark Adams 2109371c9d4SSatish Balay static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) { 211e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2129ae82921SPaul Mullowney PetscBool flg; 213a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 2146e111a19SKarl Rupp 2159ae82921SPaul Mullowney PetscFunctionBegin; 216d0609cedSBarry Smith PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options"); 2179ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 2189371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 2199566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format)); 220afb2bd1cSJunchao Zhang 2219371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 2229566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 2239566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg)); 2249566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve)); 225afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2269371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg)); 227afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 228ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301 229aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 230a435da06SStefano Zampini #else 231aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 232a435da06SStefano Zampini #endif 2339371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg)); 234aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 235afb2bd1cSJunchao Zhang 2369371c9d4SSatish Balay PetscCall( 2379371c9d4SSatish Balay PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg)); 238aed4548fSBarry Smith PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 239afb2bd1cSJunchao Zhang #endif 2404c87dfd4SPaul Mullowney } 241d0609cedSBarry Smith PetscOptionsHeadEnd(); 2429ae82921SPaul Mullowney PetscFunctionReturn(0); 2439ae82921SPaul Mullowney } 2449ae82921SPaul Mullowney 2459371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) { 2469ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2479ae82921SPaul Mullowney PetscInt n = A->rmap->n; 2489ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 249aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 2509ae82921SPaul Mullowney const PetscInt *ai = a->i, *aj = a->j, *vi; 2519ae82921SPaul Mullowney const MatScalar *aa = a->a, *v; 2529ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 2539ae82921SPaul Mullowney PetscInt i, nz, nzLower, offset, rowOffset; 2549ae82921SPaul Mullowney 2559ae82921SPaul Mullowney PetscFunctionBegin; 256cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 257c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2589ae82921SPaul Mullowney try { 2599ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 2609ae82921SPaul Mullowney nzLower = n + ai[n] - ai[1]; 261da79fbbcSStefano Zampini if (!loTriFactor) { 2622cbc15d9SMark PetscScalar *AALo; 2632cbc15d9SMark 2649566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar))); 2659ae82921SPaul Mullowney 2669ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 2679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt))); 2689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt))); 2699ae82921SPaul Mullowney 2709ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 2719ae82921SPaul Mullowney AiLo[0] = (PetscInt)0; 2729ae82921SPaul Mullowney AiLo[n] = nzLower; 2739ae82921SPaul Mullowney AjLo[0] = (PetscInt)0; 2749ae82921SPaul Mullowney AALo[0] = (MatScalar)1.0; 2759ae82921SPaul Mullowney v = aa; 2769ae82921SPaul Mullowney vi = aj; 2779ae82921SPaul Mullowney offset = 1; 2789ae82921SPaul Mullowney rowOffset = 1; 2799ae82921SPaul Mullowney for (i = 1; i < n; i++) { 2809ae82921SPaul Mullowney nz = ai[i + 1] - ai[i]; 281e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 2829ae82921SPaul Mullowney AiLo[i] = rowOffset; 2839ae82921SPaul Mullowney rowOffset += nz + 1; 2849ae82921SPaul Mullowney 2859566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz)); 2869566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AALo[offset]), v, nz)); 2879ae82921SPaul Mullowney 2889ae82921SPaul Mullowney offset += nz; 2899ae82921SPaul Mullowney AjLo[offset] = (PetscInt)i; 2909ae82921SPaul Mullowney AALo[offset] = (MatScalar)1.0; 2919ae82921SPaul Mullowney offset += 1; 2929ae82921SPaul Mullowney 2939ae82921SPaul Mullowney v += nz; 2949ae82921SPaul Mullowney vi += nz; 2959ae82921SPaul Mullowney } 2962205254eSKarl Rupp 297aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 2989566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 299da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 300aa372e3fSPaul Mullowney /* Create the matrix description */ 3019566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 3029566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 3031b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 3049566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 305afb2bd1cSJunchao Zhang #else 3069566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 307afb2bd1cSJunchao Zhang #endif 3089566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 3099566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 310aa372e3fSPaul Mullowney 311aa372e3fSPaul Mullowney /* set the operation */ 312aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 313aa372e3fSPaul Mullowney 314aa372e3fSPaul Mullowney /* set the matrix */ 315aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 316aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 317aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 318aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 319aa372e3fSPaul Mullowney 320aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 321aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1); 322aa372e3fSPaul Mullowney 323aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 324aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower); 325aa372e3fSPaul Mullowney 326aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 327aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo + nzLower); 328aa372e3fSPaul Mullowney 329afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 3309566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 331261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 3321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 3339371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 3349371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 3359566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 336afb2bd1cSJunchao Zhang #endif 337afb2bd1cSJunchao Zhang 338aa372e3fSPaul Mullowney /* perform the solve analysis */ 3399371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 3409371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), 3411b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 3429371c9d4SSatish Balay loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 343d49cd2b7SBarry Smith #else 3445f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 345afb2bd1cSJunchao Zhang #endif 3469566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 3479566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 348aa372e3fSPaul Mullowney 349da79fbbcSStefano Zampini /* assign the pointer */ 350aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 3512cbc15d9SMark loTriFactor->AA_h = AALo; 3529566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiLo)); 3539566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjLo)); 3549566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar))); 355da79fbbcSStefano Zampini } else { /* update values only */ 356*48a46eb9SPierre Jolivet if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar))); 357da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 3582cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 359da79fbbcSStefano Zampini v = aa; 360da79fbbcSStefano Zampini vi = aj; 361da79fbbcSStefano Zampini offset = 1; 362da79fbbcSStefano Zampini for (i = 1; i < n; i++) { 363da79fbbcSStefano Zampini nz = ai[i + 1] - ai[i]; 3649566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz)); 365da79fbbcSStefano Zampini offset += nz; 3662cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 367da79fbbcSStefano Zampini offset += 1; 368da79fbbcSStefano Zampini v += nz; 369da79fbbcSStefano Zampini } 3702cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower); 3719566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar))); 372da79fbbcSStefano Zampini } 3739371c9d4SSatish Balay } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } 3749ae82921SPaul Mullowney } 3759ae82921SPaul Mullowney PetscFunctionReturn(0); 3769ae82921SPaul Mullowney } 3779ae82921SPaul Mullowney 3789371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) { 3799ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3809ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3819ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 382aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 3839ae82921SPaul Mullowney const PetscInt *aj = a->j, *adiag = a->diag, *vi; 3849ae82921SPaul Mullowney const MatScalar *aa = a->a, *v; 3859ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 3869ae82921SPaul Mullowney PetscInt i, nz, nzUpper, offset; 3879ae82921SPaul Mullowney 3889ae82921SPaul Mullowney PetscFunctionBegin; 389cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 390c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3919ae82921SPaul Mullowney try { 3929ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. */ 3939ae82921SPaul Mullowney nzUpper = adiag[0] - adiag[n]; 394da79fbbcSStefano Zampini if (!upTriFactor) { 3952cbc15d9SMark PetscScalar *AAUp; 3962cbc15d9SMark 3979566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 3982cbc15d9SMark 3999ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 4009566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 4019566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 4029ae82921SPaul Mullowney 4039ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 4049ae82921SPaul Mullowney AiUp[0] = (PetscInt)0; 4059ae82921SPaul Mullowney AiUp[n] = nzUpper; 4069ae82921SPaul Mullowney offset = nzUpper; 4079ae82921SPaul Mullowney for (i = n - 1; i >= 0; i--) { 4089ae82921SPaul Mullowney v = aa + adiag[i + 1] + 1; 4099ae82921SPaul Mullowney vi = aj + adiag[i + 1] + 1; 4109ae82921SPaul Mullowney 411e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 4129ae82921SPaul Mullowney nz = adiag[i] - adiag[i + 1] - 1; 4139ae82921SPaul Mullowney 414e057df02SPaul Mullowney /* decrement the offset */ 4159ae82921SPaul Mullowney offset -= (nz + 1); 4169ae82921SPaul Mullowney 417e057df02SPaul Mullowney /* first, set the diagonal elements */ 4189ae82921SPaul Mullowney AjUp[offset] = (PetscInt)i; 41909f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1. / v[nz]; 4209ae82921SPaul Mullowney AiUp[i] = AiUp[i + 1] - (nz + 1); 4219ae82921SPaul Mullowney 4229566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz)); 4239566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz)); 4249ae82921SPaul Mullowney } 4252205254eSKarl Rupp 426aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 4279566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 428da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 4292205254eSKarl Rupp 430aa372e3fSPaul Mullowney /* Create the matrix description */ 4319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 4329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 4331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4349566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 435afb2bd1cSJunchao Zhang #else 4369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 437afb2bd1cSJunchao Zhang #endif 4389566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 4399566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 440aa372e3fSPaul Mullowney 441aa372e3fSPaul Mullowney /* set the operation */ 442aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 443aa372e3fSPaul Mullowney 444aa372e3fSPaul Mullowney /* set the matrix */ 445aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 446aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 447aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 448aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 449aa372e3fSPaul Mullowney 450aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 451aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1); 452aa372e3fSPaul Mullowney 453aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 454aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper); 455aa372e3fSPaul Mullowney 456aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 457aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper); 458aa372e3fSPaul Mullowney 459afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 4609566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 461261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 4621b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4639371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 4649371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 4659566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 466afb2bd1cSJunchao Zhang #endif 467afb2bd1cSJunchao Zhang 468aa372e3fSPaul Mullowney /* perform the solve analysis */ 4699371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 4709371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), 4711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 4729371c9d4SSatish Balay upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 473d49cd2b7SBarry Smith #else 4745f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 475afb2bd1cSJunchao Zhang #endif 4769566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 4779566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 478aa372e3fSPaul Mullowney 479da79fbbcSStefano Zampini /* assign the pointer */ 480aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 4812cbc15d9SMark upTriFactor->AA_h = AAUp; 4829566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 4839566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 4849566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar))); 485da79fbbcSStefano Zampini } else { 486*48a46eb9SPierre Jolivet if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar))); 487da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 488da79fbbcSStefano Zampini offset = nzUpper; 489da79fbbcSStefano Zampini for (i = n - 1; i >= 0; i--) { 490da79fbbcSStefano Zampini v = aa + adiag[i + 1] + 1; 491da79fbbcSStefano Zampini 492da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 493da79fbbcSStefano Zampini nz = adiag[i] - adiag[i + 1] - 1; 494da79fbbcSStefano Zampini 495da79fbbcSStefano Zampini /* decrement the offset */ 496da79fbbcSStefano Zampini offset -= (nz + 1); 497da79fbbcSStefano Zampini 498da79fbbcSStefano Zampini /* first, set the diagonal elements */ 4992cbc15d9SMark upTriFactor->AA_h[offset] = 1. / v[nz]; 5009566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz)); 501da79fbbcSStefano Zampini } 5022cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper); 5039566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar))); 504da79fbbcSStefano Zampini } 5059371c9d4SSatish Balay } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } 5069ae82921SPaul Mullowney } 5079ae82921SPaul Mullowney PetscFunctionReturn(0); 5089ae82921SPaul Mullowney } 5099ae82921SPaul Mullowney 5109371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) { 5119ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 5129ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 5139ae82921SPaul Mullowney IS isrow = a->row, iscol = a->icol; 5149ae82921SPaul Mullowney PetscBool row_identity, col_identity; 5159ae82921SPaul Mullowney PetscInt n = A->rmap->n; 5169ae82921SPaul Mullowney 5179ae82921SPaul Mullowney PetscFunctionBegin; 51828b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 5199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 5209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 5212205254eSKarl Rupp 522da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 523aa372e3fSPaul Mullowney cusparseTriFactors->nnz = a->nz; 5249ae82921SPaul Mullowney 525c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 526e057df02SPaul Mullowney /* lower triangular indices */ 5279566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow, &row_identity)); 528da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 529da79fbbcSStefano Zampini const PetscInt *r; 530da79fbbcSStefano Zampini 5319566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 532aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 533aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r + n); 5349566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 5359566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 536da79fbbcSStefano Zampini } 5379ae82921SPaul Mullowney 538e057df02SPaul Mullowney /* upper triangular indices */ 5399566063dSJacob Faibussowitsch PetscCall(ISIdentity(iscol, &col_identity)); 540da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 541da79fbbcSStefano Zampini const PetscInt *c; 542da79fbbcSStefano Zampini 5439566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iscol, &c)); 544aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 545aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c + n); 5469566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &c)); 5479566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 548da79fbbcSStefano Zampini } 5499ae82921SPaul Mullowney PetscFunctionReturn(0); 5509ae82921SPaul Mullowney } 5519ae82921SPaul Mullowney 5529371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) { 553087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 554087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 555aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 556aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 557087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 558087f3262SPaul Mullowney PetscScalar *AAUp; 559087f3262SPaul Mullowney PetscScalar *AALo; 560087f3262SPaul Mullowney PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j; 561087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 562087f3262SPaul Mullowney const PetscInt *ai = b->i, *aj = b->j, *vj; 563087f3262SPaul Mullowney const MatScalar *aa = b->a, *v; 564087f3262SPaul Mullowney 565087f3262SPaul Mullowney PetscFunctionBegin; 566cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 567c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 568087f3262SPaul Mullowney try { 5699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 5709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 571da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 572087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 5739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 5749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 575087f3262SPaul Mullowney 576087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 577087f3262SPaul Mullowney AiUp[0] = (PetscInt)0; 578087f3262SPaul Mullowney AiUp[n] = nzUpper; 579087f3262SPaul Mullowney offset = 0; 580087f3262SPaul Mullowney for (i = 0; i < n; i++) { 581087f3262SPaul Mullowney /* set the pointers */ 582087f3262SPaul Mullowney v = aa + ai[i]; 583087f3262SPaul Mullowney vj = aj + ai[i]; 584087f3262SPaul Mullowney nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 585087f3262SPaul Mullowney 586087f3262SPaul Mullowney /* first, set the diagonal elements */ 587087f3262SPaul Mullowney AjUp[offset] = (PetscInt)i; 58809f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0 / v[nz]; 589087f3262SPaul Mullowney AiUp[i] = offset; 59009f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0 / v[nz]; 591087f3262SPaul Mullowney 592087f3262SPaul Mullowney offset += 1; 593087f3262SPaul Mullowney if (nz > 0) { 5949566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz)); 5959566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 596087f3262SPaul Mullowney for (j = offset; j < offset + nz; j++) { 597087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 598087f3262SPaul Mullowney AALo[j] = AAUp[j] / v[nz]; 599087f3262SPaul Mullowney } 600087f3262SPaul Mullowney offset += nz; 601087f3262SPaul Mullowney } 602087f3262SPaul Mullowney } 603087f3262SPaul Mullowney 604aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 6059566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 606da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 607087f3262SPaul Mullowney 608aa372e3fSPaul Mullowney /* Create the matrix description */ 6099566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 6109566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 6111b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 6129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 613afb2bd1cSJunchao Zhang #else 6149566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 615afb2bd1cSJunchao Zhang #endif 6169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 6179566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 618087f3262SPaul Mullowney 619aa372e3fSPaul Mullowney /* set the matrix */ 620aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 621aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 622aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 623aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 624aa372e3fSPaul Mullowney 625aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 626aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 627aa372e3fSPaul Mullowney 628aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 629aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 630aa372e3fSPaul Mullowney 631aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 632aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 633aa372e3fSPaul Mullowney 634afb2bd1cSJunchao Zhang /* set the operation */ 635afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 636afb2bd1cSJunchao Zhang 637afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 6389566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 639261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 6401b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 6419371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 6429371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 6439566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 644afb2bd1cSJunchao Zhang #endif 645afb2bd1cSJunchao Zhang 646aa372e3fSPaul Mullowney /* perform the solve analysis */ 6479371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 6489371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), 6491b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 6509371c9d4SSatish Balay upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 651d49cd2b7SBarry Smith #else 6525f80ce2aSJacob Faibussowitsch upTriFactor->solveInfo)); 653afb2bd1cSJunchao Zhang #endif 6549566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 6559566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 656aa372e3fSPaul Mullowney 657da79fbbcSStefano Zampini /* assign the pointer */ 658aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 659aa372e3fSPaul Mullowney 660aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 6619566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 662da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 663aa372e3fSPaul Mullowney 664aa372e3fSPaul Mullowney /* Create the matrix description */ 6659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 6669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 6671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 6689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 669afb2bd1cSJunchao Zhang #else 6709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 671afb2bd1cSJunchao Zhang #endif 6729566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 6739566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 674aa372e3fSPaul Mullowney 675aa372e3fSPaul Mullowney /* set the operation */ 676aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 677aa372e3fSPaul Mullowney 678aa372e3fSPaul Mullowney /* set the matrix */ 679aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 680aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 681aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 682aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 683aa372e3fSPaul Mullowney 684aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 685aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 686aa372e3fSPaul Mullowney 687aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 688aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 689aa372e3fSPaul Mullowney 690aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 691aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 692aa372e3fSPaul Mullowney 693afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 6949566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 695261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 6961b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 6979371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 6989371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 6999566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 700afb2bd1cSJunchao Zhang #endif 701afb2bd1cSJunchao Zhang 702aa372e3fSPaul Mullowney /* perform the solve analysis */ 7039371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 7049371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), 7051b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 7069371c9d4SSatish Balay loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 707d49cd2b7SBarry Smith #else 7085f80ce2aSJacob Faibussowitsch loTriFactor->solveInfo)); 709afb2bd1cSJunchao Zhang #endif 7109566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 7119566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 712aa372e3fSPaul Mullowney 713da79fbbcSStefano Zampini /* assign the pointer */ 714aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 715087f3262SPaul Mullowney 7169566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 7179566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 7189566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 719da79fbbcSStefano Zampini } else { 720da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 721da79fbbcSStefano Zampini offset = 0; 722da79fbbcSStefano Zampini for (i = 0; i < n; i++) { 723da79fbbcSStefano Zampini /* set the pointers */ 724da79fbbcSStefano Zampini v = aa + ai[i]; 725da79fbbcSStefano Zampini nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 726da79fbbcSStefano Zampini 727da79fbbcSStefano Zampini /* first, set the diagonal elements */ 728da79fbbcSStefano Zampini AAUp[offset] = 1.0 / v[nz]; 729da79fbbcSStefano Zampini AALo[offset] = 1.0 / v[nz]; 730da79fbbcSStefano Zampini 731da79fbbcSStefano Zampini offset += 1; 732da79fbbcSStefano Zampini if (nz > 0) { 7339566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz)); 734da79fbbcSStefano Zampini for (j = offset; j < offset + nz; j++) { 735da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 736da79fbbcSStefano Zampini AALo[j] = AAUp[j] / v[nz]; 737da79fbbcSStefano Zampini } 738da79fbbcSStefano Zampini offset += nz; 739da79fbbcSStefano Zampini } 740da79fbbcSStefano Zampini } 74128b400f6SJacob Faibussowitsch PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 74228b400f6SJacob Faibussowitsch PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 743da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 744da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 7459566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 746da79fbbcSStefano Zampini } 7479566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AAUp)); 7489566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AALo)); 7499371c9d4SSatish Balay } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } 750087f3262SPaul Mullowney } 751087f3262SPaul Mullowney PetscFunctionReturn(0); 752087f3262SPaul Mullowney } 753087f3262SPaul Mullowney 7549371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) { 755087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 756087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 757087f3262SPaul Mullowney IS ip = a->row; 758087f3262SPaul Mullowney PetscBool perm_identity; 759087f3262SPaul Mullowney PetscInt n = A->rmap->n; 760087f3262SPaul Mullowney 761087f3262SPaul Mullowney PetscFunctionBegin; 76228b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 7639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 764da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 765aa372e3fSPaul Mullowney cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 766aa372e3fSPaul Mullowney 767da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 768da79fbbcSStefano Zampini 769087f3262SPaul Mullowney /* lower triangular indices */ 7709566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip, &perm_identity)); 771087f3262SPaul Mullowney if (!perm_identity) { 7724e4bbfaaSStefano Zampini IS iip; 773da79fbbcSStefano Zampini const PetscInt *irip, *rip; 7744e4bbfaaSStefano Zampini 7759566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 7769566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iip, &irip)); 7779566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ip, &rip)); 778aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 779aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip + n); 780aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 7814e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip + n); 7829566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iip, &irip)); 7839566063dSJacob Faibussowitsch PetscCall(ISDestroy(&iip)); 7849566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ip, &rip)); 7859566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 786da79fbbcSStefano Zampini } 787087f3262SPaul Mullowney PetscFunctionReturn(0); 788087f3262SPaul Mullowney } 789087f3262SPaul Mullowney 7909371c9d4SSatish Balay static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) { 791087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 792087f3262SPaul Mullowney IS ip = b->row; 793087f3262SPaul Mullowney PetscBool perm_identity; 794087f3262SPaul Mullowney 795087f3262SPaul Mullowney PetscFunctionBegin; 7969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 7979566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 798ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 799087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 8009566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip, &perm_identity)); 801087f3262SPaul Mullowney if (perm_identity) { 802087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 803087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 8044e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8054e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 806087f3262SPaul Mullowney } else { 807087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 808087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 8094e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8104e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 811087f3262SPaul Mullowney } 812087f3262SPaul Mullowney 813087f3262SPaul Mullowney /* get the triangular factors */ 8149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 815087f3262SPaul Mullowney PetscFunctionReturn(0); 816087f3262SPaul Mullowney } 8179ae82921SPaul Mullowney 8189371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) { 819bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 820aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 821aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 822da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 823da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 824aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 825aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 826aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 827aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 828b175d8bbSPaul Mullowney 829bda325fcSPaul Mullowney PetscFunctionBegin; 830aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 8319566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 832da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 833aa372e3fSPaul Mullowney 834aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 835aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 836aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 8379371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 838aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 839aa372e3fSPaul Mullowney 840aa372e3fSPaul Mullowney /* Create the matrix description */ 8419566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 8429566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 8439566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 8449566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 8459566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 846aa372e3fSPaul Mullowney 847aa372e3fSPaul Mullowney /* set the operation */ 848aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 849aa372e3fSPaul Mullowney 850aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 851aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 852afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 853afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 854aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 855afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 856afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 857afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 858aa372e3fSPaul Mullowney 859aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 860afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 8619371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 8629371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 8639371c9d4SSatish Balay loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 8649566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 865afb2bd1cSJunchao Zhang #endif 866afb2bd1cSJunchao Zhang 8679566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 8689371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 8699371c9d4SSatish Balay loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 870afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 8719371c9d4SSatish Balay loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer)); 872afb2bd1cSJunchao Zhang #else 8739371c9d4SSatish Balay loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase)); 874afb2bd1cSJunchao Zhang #endif 8759566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 8769566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 877aa372e3fSPaul Mullowney 878afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 8799566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 880261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 8811b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8829371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 8839371c9d4SSatish Balay loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 8849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 885afb2bd1cSJunchao Zhang #endif 886afb2bd1cSJunchao Zhang 887afb2bd1cSJunchao Zhang /* perform the solve analysis */ 8889371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 8899371c9d4SSatish Balay loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 8901b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8919371c9d4SSatish Balay loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 892d49cd2b7SBarry Smith #else 8935f80ce2aSJacob Faibussowitsch loTriFactorT->solveInfo)); 894afb2bd1cSJunchao Zhang #endif 8959566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 8969566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 897aa372e3fSPaul Mullowney 898da79fbbcSStefano Zampini /* assign the pointer */ 899aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 900aa372e3fSPaul Mullowney 901aa372e3fSPaul Mullowney /*********************************************/ 902aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 903aa372e3fSPaul Mullowney /*********************************************/ 904aa372e3fSPaul Mullowney 905aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 9069566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactorT)); 907da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 908aa372e3fSPaul Mullowney 909aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 910aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 911aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 9129371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 913aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 914aa372e3fSPaul Mullowney 915aa372e3fSPaul Mullowney /* Create the matrix description */ 9169566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 9179566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 9189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 9199566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 9209566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 921aa372e3fSPaul Mullowney 922aa372e3fSPaul Mullowney /* set the operation */ 923aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 924aa372e3fSPaul Mullowney 925aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 926aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 927afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 928afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 929aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 930afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 931afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 932afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 933aa372e3fSPaul Mullowney 934aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. the CSC */ 935afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 9369371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 9379371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 9389371c9d4SSatish Balay upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 9399566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 940afb2bd1cSJunchao Zhang #endif 941afb2bd1cSJunchao Zhang 9429566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 9439371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 9449371c9d4SSatish Balay upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 945afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 9469371c9d4SSatish Balay upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer)); 947afb2bd1cSJunchao Zhang #else 9489371c9d4SSatish Balay upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase)); 949afb2bd1cSJunchao Zhang #endif 950d49cd2b7SBarry Smith 9519566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9529566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 953aa372e3fSPaul Mullowney 954afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 9559566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 956261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 9571b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 9589371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 9599371c9d4SSatish Balay upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 9609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 961afb2bd1cSJunchao Zhang #endif 962afb2bd1cSJunchao Zhang 963afb2bd1cSJunchao Zhang /* perform the solve analysis */ 9645f80ce2aSJacob Faibussowitsch /* christ, would it have killed you to put this stuff in a function????????? */ 9659371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 9669371c9d4SSatish Balay upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 9671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 9689371c9d4SSatish Balay upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 969d49cd2b7SBarry Smith #else 9705f80ce2aSJacob Faibussowitsch upTriFactorT->solveInfo)); 971afb2bd1cSJunchao Zhang #endif 972d49cd2b7SBarry Smith 9739566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9749566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 975aa372e3fSPaul Mullowney 976da79fbbcSStefano Zampini /* assign the pointer */ 977aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 978bda325fcSPaul Mullowney PetscFunctionReturn(0); 979bda325fcSPaul Mullowney } 980bda325fcSPaul Mullowney 9819371c9d4SSatish Balay struct PetscScalarToPetscInt { 9829371c9d4SSatish Balay __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 983a49f1ed0SStefano Zampini }; 984a49f1ed0SStefano Zampini 9859371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) { 986aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 987a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 988bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 989bda325fcSPaul Mullowney cusparseStatus_t stat; 990aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 991b175d8bbSPaul Mullowney 992bda325fcSPaul Mullowney PetscFunctionBegin; 9939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 994a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 99528b400f6SJacob Faibussowitsch PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 996a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 99708401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 9981a2c6b5cSJunchao Zhang if (A->transupdated) PetscFunctionReturn(0); 9999566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 10009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1001*48a46eb9SPierre Jolivet if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1002a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1003aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 10049566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1005aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 10069566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 10079566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1008aa372e3fSPaul Mullowney 1009b06137fdSPaul Mullowney /* set alpha and beta */ 10109566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar))); 10119566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar))); 10129566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar))); 10139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 10149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 10159566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1016b06137fdSPaul Mullowney 1017aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1018aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1019a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1020554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1021554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1022aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1023a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1024aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1025aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1026a3fdcf43SKarl Rupp 1027039c6fbaSStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); } 102881902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1029afb2bd1cSJunchao Zhang 1030afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 10313606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 10329371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 10339371c9d4SSatish Balay indexBase, cusparse_scalartype); 10349371c9d4SSatish Balay PetscCallCUSPARSE(stat); 10353606e59fSJunchao Zhang #else 10363606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 10373606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 10383606e59fSJunchao Zhang 10393606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 10403606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 10413606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 10423606e59fSJunchao Zhang */ 10433606e59fSJunchao Zhang if (matrixT->num_entries) { 10449371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 10459371c9d4SSatish Balay PetscCallCUSPARSE(stat); 10463606e59fSJunchao Zhang 10473606e59fSJunchao Zhang } else { 10483606e59fSJunchao Zhang matstructT->matDescr = NULL; 10493606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 10503606e59fSJunchao Zhang } 10513606e59fSJunchao Zhang #endif 1052afb2bd1cSJunchao Zhang #endif 1053aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1054afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1055afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1056afb2bd1cSJunchao Zhang #else 1057aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 105851c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 105951c6d536SStefano Zampini /* First convert HYB to CSR */ 1060aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1061aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1062aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1063aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1064aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1065aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1066aa372e3fSPaul Mullowney 10679371c9d4SSatish Balay stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 10689371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1069aa372e3fSPaul Mullowney 1070aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1071aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1072aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1073aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1074aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1075aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1076aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1077aa372e3fSPaul Mullowney 10789371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 10799371c9d4SSatish Balay tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 10809371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1081aa372e3fSPaul Mullowney 1082aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1083aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 10849566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 10859371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 10869371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 10879371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1088aa372e3fSPaul Mullowney 1089aa372e3fSPaul Mullowney /* assign the pointer */ 1090aa372e3fSPaul Mullowney matstructT->mat = hybMat; 10911a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1092aa372e3fSPaul Mullowney /* delete temporaries */ 1093aa372e3fSPaul Mullowney if (tempT) { 1094aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1095aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1096aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1097aa372e3fSPaul Mullowney delete (CsrMatrix *)tempT; 1098087f3262SPaul Mullowney } 1099aa372e3fSPaul Mullowney if (temp) { 1100aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY *)temp->values; 1101aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1102aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1103aa372e3fSPaul Mullowney delete (CsrMatrix *)temp; 1104aa372e3fSPaul Mullowney } 1105afb2bd1cSJunchao Zhang #endif 1106aa372e3fSPaul Mullowney } 1107a49f1ed0SStefano Zampini } 1108a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1109a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1110a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 111128b400f6SJacob Faibussowitsch PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 111228b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 111328b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 111428b400f6SJacob Faibussowitsch PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 111528b400f6SJacob Faibussowitsch PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 111628b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 111728b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 111828b400f6SJacob Faibussowitsch PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1119a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1120a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1121a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 11229566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1123a49f1ed0SStefano Zampini } 1124a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1125a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1126792fecdfSBarry Smith PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1127a49f1ed0SStefano Zampini 1128a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1129a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1130a49f1ed0SStefano Zampini void *csr2cscBuffer; 1131a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 11329371c9d4SSatish Balay stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 11339371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize); 11349371c9d4SSatish Balay PetscCallCUSPARSE(stat); 11359566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1136a49f1ed0SStefano Zampini #endif 1137a49f1ed0SStefano Zampini 11381a2c6b5cSJunchao Zhang if (matrix->num_entries) { 11391a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 11401a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 11411a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 11421a2c6b5cSJunchao Zhang 11431a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 11441a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 11451a2c6b5cSJunchao Zhang */ 11469371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1147a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11489371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 11499371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1150a49f1ed0SStefano Zampini #else 11519371c9d4SSatish Balay matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 11529371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1153a49f1ed0SStefano Zampini #endif 11541a2c6b5cSJunchao Zhang } else { 11551a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 11561a2c6b5cSJunchao Zhang } 11571a2c6b5cSJunchao Zhang 1158a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1159792fecdfSBarry Smith PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1160a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11619566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1162a49f1ed0SStefano Zampini #endif 1163a49f1ed0SStefano Zampini } 11649371c9d4SSatish Balay PetscCallThrust( 11659371c9d4SSatish Balay thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1166a49f1ed0SStefano Zampini } 11679566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 11689566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1169213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1170213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1171aa372e3fSPaul Mullowney /* assign the pointer */ 1172aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 11731a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1174bda325fcSPaul Mullowney PetscFunctionReturn(0); 1175bda325fcSPaul Mullowney } 1176bda325fcSPaul Mullowney 1177a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 11789371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) { 1179c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1180465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1181465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1182465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1183465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1184bda325fcSPaul Mullowney cusparseStatus_t stat; 1185bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1186aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1187aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1188aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1189bda325fcSPaul Mullowney 1190bda325fcSPaul Mullowney PetscFunctionBegin; 1191aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1192aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 11939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1194aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1195aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1196bda325fcSPaul Mullowney } 1197bda325fcSPaul Mullowney 1198bda325fcSPaul Mullowney /* Get the GPU pointers */ 11999566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 12009566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1201c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1202c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1203bda325fcSPaul Mullowney 12049566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1205aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 12069371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1207aa372e3fSPaul Mullowney 1208aa372e3fSPaul Mullowney /* First, solve U */ 12099371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, 12101b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1211afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1212afb2bd1cSJunchao Zhang #endif 12139371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, 12141b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 12159371c9d4SSatish Balay tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer); 12169371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1217d49cd2b7SBarry Smith #else 12189371c9d4SSatish Balay tempGPU->data().get()); 12199371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1220afb2bd1cSJunchao Zhang #endif 1221aa372e3fSPaul Mullowney 1222aa372e3fSPaul Mullowney /* Then, solve L */ 12239371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, 12241b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1225afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1226afb2bd1cSJunchao Zhang #endif 12279371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1228d49cd2b7SBarry Smith tempGPU->data().get(), 12291b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 12309371c9d4SSatish Balay xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer); 12319371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1232d49cd2b7SBarry Smith #else 12339371c9d4SSatish Balay xarray); 12349371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1235afb2bd1cSJunchao Zhang #endif 1236aa372e3fSPaul Mullowney 1237aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 12389371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()); 1239aa372e3fSPaul Mullowney 1240aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. */ 1241a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU); 1242bda325fcSPaul Mullowney 1243bda325fcSPaul Mullowney /* restore */ 12449566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 12459566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 12469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 12479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1248bda325fcSPaul Mullowney PetscFunctionReturn(0); 1249bda325fcSPaul Mullowney } 1250bda325fcSPaul Mullowney 12519371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) { 1252465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1253465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1254bda325fcSPaul Mullowney cusparseStatus_t stat; 1255bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1256aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1257aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1258aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1259bda325fcSPaul Mullowney 1260bda325fcSPaul Mullowney PetscFunctionBegin; 1261aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... on the fly */ 1262aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 12639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1264aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1265aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1266bda325fcSPaul Mullowney } 1267bda325fcSPaul Mullowney 1268bda325fcSPaul Mullowney /* Get the GPU pointers */ 12699566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 12709566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1271bda325fcSPaul Mullowney 12729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1273aa372e3fSPaul Mullowney /* First, solve U */ 12749371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, 12751b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1276afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1277afb2bd1cSJunchao Zhang #endif 12789371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, 12791b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 12809371c9d4SSatish Balay tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer); 12819371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1282d49cd2b7SBarry Smith #else 12839371c9d4SSatish Balay tempGPU->data().get()); 12849371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1285afb2bd1cSJunchao Zhang #endif 1286aa372e3fSPaul Mullowney 1287aa372e3fSPaul Mullowney /* Then, solve L */ 12889371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, 12891b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1290afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1291afb2bd1cSJunchao Zhang #endif 12929371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1293d49cd2b7SBarry Smith tempGPU->data().get(), 12941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 12959371c9d4SSatish Balay xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer); 12969371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1297d49cd2b7SBarry Smith #else 12989371c9d4SSatish Balay xarray); 12999371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1300afb2bd1cSJunchao Zhang #endif 1301bda325fcSPaul Mullowney 1302bda325fcSPaul Mullowney /* restore */ 13039566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 13049566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 13059566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 13069566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 1307bda325fcSPaul Mullowney PetscFunctionReturn(0); 1308bda325fcSPaul Mullowney } 1309bda325fcSPaul Mullowney 13109371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) { 1311465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1312465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1313465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1314465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 13159ae82921SPaul Mullowney cusparseStatus_t stat; 13169ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1317aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1318aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1319aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 13209ae82921SPaul Mullowney 13219ae82921SPaul Mullowney PetscFunctionBegin; 1322ebc8f436SDominic Meiser 1323e057df02SPaul Mullowney /* Get the GPU pointers */ 13249566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 13259566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1326c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1327c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 13289ae82921SPaul Mullowney 13299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1330aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 13319371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()); 1332aa372e3fSPaul Mullowney 1333aa372e3fSPaul Mullowney /* Next, solve L */ 13349371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, 13351b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1336afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1337afb2bd1cSJunchao Zhang #endif 13389371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 1339d49cd2b7SBarry Smith tempGPU->data().get(), 13401b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 13419371c9d4SSatish Balay xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer); 13429371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1343d49cd2b7SBarry Smith #else 13449371c9d4SSatish Balay xarray); 13459371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1346afb2bd1cSJunchao Zhang #endif 1347aa372e3fSPaul Mullowney 1348aa372e3fSPaul Mullowney /* Then, solve U */ 13499371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, 13501b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1351afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1352afb2bd1cSJunchao Zhang #endif 13539371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, 13541b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 13559371c9d4SSatish Balay tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer); 13569371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1357d49cd2b7SBarry Smith #else 13589371c9d4SSatish Balay tempGPU->data().get()); 13599371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1360afb2bd1cSJunchao Zhang #endif 1361d49cd2b7SBarry Smith 13624e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 13639371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU); 13649ae82921SPaul Mullowney 13659566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 13669566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 13679566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 13689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 13699ae82921SPaul Mullowney PetscFunctionReturn(0); 13709ae82921SPaul Mullowney } 13719ae82921SPaul Mullowney 13729371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) { 1373465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1374465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 13759ae82921SPaul Mullowney cusparseStatus_t stat; 13769ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1377aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1378aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1379aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 13809ae82921SPaul Mullowney 13819ae82921SPaul Mullowney PetscFunctionBegin; 1382e057df02SPaul Mullowney /* Get the GPU pointers */ 13839566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 13849566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 13859ae82921SPaul Mullowney 13869566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1387aa372e3fSPaul Mullowney /* First, solve L */ 13889371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, 13891b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1390afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1391afb2bd1cSJunchao Zhang #endif 13929371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, 13931b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 13949371c9d4SSatish Balay tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer); 13959371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1396d49cd2b7SBarry Smith #else 13979371c9d4SSatish Balay tempGPU->data().get()); 13989371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1399afb2bd1cSJunchao Zhang #endif 1400d49cd2b7SBarry Smith 1401aa372e3fSPaul Mullowney /* Next, solve U */ 14029371c9d4SSatish Balay stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, 14031b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1404afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1405afb2bd1cSJunchao Zhang #endif 14069371c9d4SSatish Balay &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 1407d49cd2b7SBarry Smith tempGPU->data().get(), 14081b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 14099371c9d4SSatish Balay xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer); 14109371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1411d49cd2b7SBarry Smith #else 14129371c9d4SSatish Balay xarray); 14139371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1414afb2bd1cSJunchao Zhang #endif 14159ae82921SPaul Mullowney 14169566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 14179566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 14189566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14199566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 14209ae82921SPaul Mullowney PetscFunctionReturn(0); 14219ae82921SPaul Mullowney } 14229ae82921SPaul Mullowney 1423da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 1424da112707SJunchao Zhang /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */ 14259371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) { 1426da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1427da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1428da112707SJunchao Zhang const PetscScalar *barray; 1429da112707SJunchao Zhang PetscScalar *xarray; 1430da112707SJunchao Zhang 1431da112707SJunchao Zhang PetscFunctionBegin; 1432da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1433da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1434da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1435da112707SJunchao Zhang 1436da112707SJunchao Zhang /* Solve L*y = b */ 1437da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1438da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 14399371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 14409371c9d4SSatish Balay fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, 144112ba2bc6SJunchao Zhang fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()! 1442da112707SJunchao Zhang 1443da112707SJunchao Zhang /* Solve U*x = y */ 1444da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 14459371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */ 14469371c9d4SSatish Balay fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U)); 1447da112707SJunchao Zhang 1448da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1449da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1450da112707SJunchao Zhang 1451da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1452da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1453da112707SJunchao Zhang PetscFunctionReturn(0); 1454da112707SJunchao Zhang } 1455da112707SJunchao Zhang 14569371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) { 1457da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1458da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1459da112707SJunchao Zhang const PetscScalar *barray; 1460da112707SJunchao Zhang PetscScalar *xarray; 1461da112707SJunchao Zhang 1462da112707SJunchao Zhang PetscFunctionBegin; 146312ba2bc6SJunchao Zhang if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */ 1464da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 14659371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do tranpose solve with it */ 14669371c9d4SSatish Balay fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1467da112707SJunchao Zhang 1468da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 14699371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1470da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 147112ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 147212ba2bc6SJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_TRUE; 147312ba2bc6SJunchao Zhang } 1474da112707SJunchao Zhang 147512ba2bc6SJunchao Zhang if (!fs->updatedTransposeSpSVAnalysis) { 14769371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1477da112707SJunchao Zhang 14789371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 147912ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1480da112707SJunchao Zhang } 1481da112707SJunchao Zhang 1482da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1483da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1484da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1485da112707SJunchao Zhang 1486da112707SJunchao Zhang /* Solve Ut*y = b */ 1487da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1488da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 14899371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */ 14909371c9d4SSatish Balay fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut)); 1491da112707SJunchao Zhang 1492da112707SJunchao Zhang /* Solve Lt*x = y */ 1493da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 14949371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 14959371c9d4SSatish Balay fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1496da112707SJunchao Zhang 1497da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1498da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1499da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1500da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1501da112707SJunchao Zhang PetscFunctionReturn(0); 1502da112707SJunchao Zhang } 1503da112707SJunchao Zhang 15049371c9d4SSatish Balay static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) { 1505da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1506da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1507da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1508da112707SJunchao Zhang CsrMatrix *Acsr; 1509da112707SJunchao Zhang PetscInt m, nz; 1510da112707SJunchao Zhang PetscBool flg; 1511da112707SJunchao Zhang 1512da112707SJunchao Zhang PetscFunctionBegin; 1513da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1514da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1515da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1516da112707SJunchao Zhang } 1517da112707SJunchao Zhang 1518da112707SJunchao Zhang /* Copy A's value to fact */ 1519da112707SJunchao Zhang m = fact->rmap->n; 1520da112707SJunchao Zhang nz = aij->nz; 1521da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1522da112707SJunchao Zhang Acsr = (CsrMatrix *)Acusp->mat->mat; 1523da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1524da112707SJunchao Zhang 1525da112707SJunchao Zhang /* Factorize fact inplace */ 15269371c9d4SSatish Balay if (m) 15279371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 15289371c9d4SSatish Balay fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1529da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1530da112707SJunchao Zhang int numerical_zero; 1531da112707SJunchao Zhang cusparseStatus_t status; 1532da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero); 1533da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1534da112707SJunchao Zhang } 1535da112707SJunchao Zhang 153612ba2bc6SJunchao Zhang /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02() 153712ba2bc6SJunchao Zhang See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78 153812ba2bc6SJunchao Zhang */ 15399371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1540da112707SJunchao Zhang 15419371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 1542da112707SJunchao Zhang 154312ba2bc6SJunchao Zhang /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */ 154412ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 154512ba2bc6SJunchao Zhang 1546da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 1547da112707SJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ILU0; 1548da112707SJunchao Zhang fact->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_ILU0; 1549da112707SJunchao Zhang fact->ops->matsolve = NULL; 1550da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 1551da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1552da112707SJunchao Zhang PetscFunctionReturn(0); 1553da112707SJunchao Zhang } 1554da112707SJunchao Zhang 15559371c9d4SSatish Balay static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 1556da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1557da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1558da112707SJunchao Zhang PetscInt m, nz; 1559da112707SJunchao Zhang 1560da112707SJunchao Zhang PetscFunctionBegin; 1561da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1562da112707SJunchao Zhang PetscInt i; 1563da112707SJunchao Zhang PetscBool flg, missing; 1564da112707SJunchao Zhang 1565da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1566da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1567da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1568da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A, &missing, &i)); 1569da112707SJunchao Zhang PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1570da112707SJunchao Zhang } 1571da112707SJunchao Zhang 1572da112707SJunchao Zhang /* Free the old stale stuff */ 1573da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1574da112707SJunchao Zhang 1575da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1576da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 1577da112707SJunchao Zhang */ 1578da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1579da112707SJunchao Zhang 1580da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 1581da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ILU; 1582da112707SJunchao Zhang fact->info.factor_mallocs = 0; 1583da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 1584da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 1585da112707SJunchao Zhang 1586da112707SJunchao Zhang aij->row = NULL; 1587da112707SJunchao Zhang aij->col = NULL; 1588da112707SJunchao Zhang 1589da112707SJunchao Zhang /* ====================================================================== */ 1590da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. */ 1591da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 1592da112707SJunchao Zhang /* ====================================================================== */ 1593da112707SJunchao Zhang const int *Ai, *Aj; 1594da112707SJunchao Zhang 1595da112707SJunchao Zhang m = fact->rmap->n; 1596da112707SJunchao Zhang nz = aij->nz; 1597da112707SJunchao Zhang 1598da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1))); 1599da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz)); 1600da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 1601da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 1602da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1603da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1604da112707SJunchao Zhang 1605da112707SJunchao Zhang /* ====================================================================== */ 1606da112707SJunchao Zhang /* Create descriptors for M, L, U */ 1607da112707SJunchao Zhang /* ====================================================================== */ 1608da112707SJunchao Zhang cusparseFillMode_t fillMode; 1609da112707SJunchao Zhang cusparseDiagType_t diagType; 1610da112707SJunchao Zhang 1611da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1612da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1613da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1614da112707SJunchao Zhang 1615da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1616da112707SJunchao Zhang cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1617da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1618da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1619da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 1620da112707SJunchao Zhang */ 1621da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 1622da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_UNIT; 16239371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 16249371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 16259371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1626da112707SJunchao Zhang 1627da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_UPPER; 1628da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 16299371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 16309371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 16319371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1632da112707SJunchao Zhang 1633da112707SJunchao Zhang /* ========================================================================= */ 1634da112707SJunchao Zhang /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1635da112707SJunchao Zhang /* ========================================================================= */ 1636da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 16379371c9d4SSatish Balay if (m) 16389371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 16399371c9d4SSatish Balay fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M)); 1640da112707SJunchao Zhang 1641da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1642da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1643da112707SJunchao Zhang 1644da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1645da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1646da112707SJunchao Zhang 1647da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 16489371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1649da112707SJunchao Zhang 1650da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 16519371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1652da112707SJunchao Zhang 1653da112707SJunchao Zhang /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 165412ba2bc6SJunchao Zhang and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 165512ba2bc6SJunchao Zhang spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 165612ba2bc6SJunchao Zhang To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 1657da112707SJunchao Zhang */ 165812ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 165912ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 166012ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 1661da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 166212ba2bc6SJunchao Zhang } else { 166312ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 166412ba2bc6SJunchao Zhang fs->spsvBuffer_U = fs->factBuffer_M; 1665da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 166612ba2bc6SJunchao Zhang } 1667da112707SJunchao Zhang 1668da112707SJunchao Zhang /* ========================================================================== */ 1669da112707SJunchao Zhang /* Perform analysis of ilu0 on M, SpSv on L and U */ 1670da112707SJunchao Zhang /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1671da112707SJunchao Zhang /* ========================================================================== */ 1672da112707SJunchao Zhang int structural_zero; 1673da112707SJunchao Zhang cusparseStatus_t status; 1674da112707SJunchao Zhang 1675da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 16769371c9d4SSatish Balay if (m) 16779371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 16789371c9d4SSatish Balay fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1679da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1680da112707SJunchao Zhang /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 1681da112707SJunchao Zhang status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1682da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1683da112707SJunchao Zhang } 1684da112707SJunchao Zhang 1685da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 16860dd8c0acSJunchao Zhang { 1687da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 16880dd8c0acSJunchao Zhang PetscInt *Ai, *Adiag, nzRow, nzLeft; 1689da112707SJunchao Zhang PetscLogDouble flops = 0.0; 1690da112707SJunchao Zhang 1691da112707SJunchao Zhang PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1692da112707SJunchao Zhang Ai = Aseq->i; 1693da112707SJunchao Zhang Adiag = Aseq->diag; 1694da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 1695da112707SJunchao Zhang if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1696da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 1697da112707SJunchao Zhang nzLeft = Adiag[i] - Ai[i]; 1698da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1699da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 1700da112707SJunchao Zhang */ 1701da112707SJunchao Zhang nzLeft = (nzRow - 1) / 2; 1702da112707SJunchao Zhang flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1703da112707SJunchao Zhang } 1704da112707SJunchao Zhang } 1705da112707SJunchao Zhang fs->numericFactFlops = flops; 17060dd8c0acSJunchao Zhang } 1707da112707SJunchao Zhang fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1708da112707SJunchao Zhang PetscFunctionReturn(0); 1709da112707SJunchao Zhang } 1710da112707SJunchao Zhang 17119371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) { 1712da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1713da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1714da112707SJunchao Zhang const PetscScalar *barray; 1715da112707SJunchao Zhang PetscScalar *xarray; 1716da112707SJunchao Zhang 1717da112707SJunchao Zhang PetscFunctionBegin; 1718da112707SJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1719da112707SJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1720da112707SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1721da112707SJunchao Zhang 1722da112707SJunchao Zhang /* Solve L*y = b */ 1723da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1724da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 17259371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */ 17269371c9d4SSatish Balay fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L)); 1727da112707SJunchao Zhang 1728da112707SJunchao Zhang /* Solve Lt*x = y */ 1729da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 17309371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */ 17319371c9d4SSatish Balay fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt)); 1732da112707SJunchao Zhang 1733da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1734da112707SJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1735da112707SJunchao Zhang 1736da112707SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1737da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n)); 1738da112707SJunchao Zhang PetscFunctionReturn(0); 1739da112707SJunchao Zhang } 1740da112707SJunchao Zhang 17419371c9d4SSatish Balay static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) { 1742da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1743da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1744da112707SJunchao Zhang Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 1745da112707SJunchao Zhang CsrMatrix *Acsr; 1746da112707SJunchao Zhang PetscInt m, nz; 1747da112707SJunchao Zhang PetscBool flg; 1748da112707SJunchao Zhang 1749da112707SJunchao Zhang PetscFunctionBegin; 1750da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1751da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1752da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1753da112707SJunchao Zhang } 1754da112707SJunchao Zhang 1755da112707SJunchao Zhang /* Copy A's value to fact */ 1756da112707SJunchao Zhang m = fact->rmap->n; 1757da112707SJunchao Zhang nz = aij->nz; 1758da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1759da112707SJunchao Zhang Acsr = (CsrMatrix *)Acusp->mat->mat; 1760da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1761da112707SJunchao Zhang 1762da112707SJunchao Zhang /* Factorize fact inplace */ 1763da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve 1764da112707SJunchao Zhang Function csric02() only takes the lower triangular part of matrix A to perform factorization. 1765da112707SJunchao Zhang The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored, 1766da112707SJunchao Zhang and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not. 1767da112707SJunchao Zhang In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided. 1768da112707SJunchao Zhang */ 17699371c9d4SSatish Balay if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1770da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1771da112707SJunchao Zhang int numerical_zero; 1772da112707SJunchao Zhang cusparseStatus_t status; 1773da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero); 1774da112707SJunchao Zhang PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero); 1775da112707SJunchao Zhang } 1776da112707SJunchao Zhang 17779371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 1778da112707SJunchao Zhang 1779da112707SJunchao Zhang /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 1780da112707SJunchao Zhang ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F 1781da112707SJunchao Zhang */ 17829371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1783da112707SJunchao Zhang 1784da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_GPU; 1785da112707SJunchao Zhang fact->ops->solve = MatSolve_SeqAIJCUSPARSE_ICC0; 1786da112707SJunchao Zhang fact->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_ICC0; 1787da112707SJunchao Zhang fact->ops->matsolve = NULL; 1788da112707SJunchao Zhang fact->ops->matsolvetranspose = NULL; 1789da112707SJunchao Zhang PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); 1790da112707SJunchao Zhang PetscFunctionReturn(0); 1791da112707SJunchao Zhang } 1792da112707SJunchao Zhang 17939371c9d4SSatish Balay static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) { 1794da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1795da112707SJunchao Zhang Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1796da112707SJunchao Zhang PetscInt m, nz; 1797da112707SJunchao Zhang 1798da112707SJunchao Zhang PetscFunctionBegin; 1799da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1800da112707SJunchao Zhang PetscInt i; 1801da112707SJunchao Zhang PetscBool flg, missing; 1802da112707SJunchao Zhang 1803da112707SJunchao Zhang PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 1804da112707SJunchao Zhang PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name); 1805da112707SJunchao Zhang PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 1806da112707SJunchao Zhang PetscCall(MatMissingDiagonal(A, &missing, &i)); 1807da112707SJunchao Zhang PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 1808da112707SJunchao Zhang } 1809da112707SJunchao Zhang 1810da112707SJunchao Zhang /* Free the old stale stuff */ 1811da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs)); 1812da112707SJunchao Zhang 1813da112707SJunchao Zhang /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host, 1814da112707SJunchao Zhang but they will not be used. Allocate them just for easy debugging. 1815da112707SJunchao Zhang */ 1816da112707SJunchao Zhang PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1817da112707SJunchao Zhang 1818da112707SJunchao Zhang fact->offloadmask = PETSC_OFFLOAD_BOTH; 1819da112707SJunchao Zhang fact->factortype = MAT_FACTOR_ICC; 1820da112707SJunchao Zhang fact->info.factor_mallocs = 0; 1821da112707SJunchao Zhang fact->info.fill_ratio_given = info->fill; 1822da112707SJunchao Zhang fact->info.fill_ratio_needed = 1.0; 1823da112707SJunchao Zhang 1824da112707SJunchao Zhang aij->row = NULL; 1825da112707SJunchao Zhang aij->col = NULL; 1826da112707SJunchao Zhang 1827da112707SJunchao Zhang /* ====================================================================== */ 1828da112707SJunchao Zhang /* Copy A's i, j to fact and also allocate the value array of fact. */ 1829da112707SJunchao Zhang /* We'll do in-place factorization on fact */ 1830da112707SJunchao Zhang /* ====================================================================== */ 1831da112707SJunchao Zhang const int *Ai, *Aj; 1832da112707SJunchao Zhang 1833da112707SJunchao Zhang m = fact->rmap->n; 1834da112707SJunchao Zhang nz = aij->nz; 1835da112707SJunchao Zhang 1836da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1))); 1837da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz)); 1838da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz)); 1839da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */ 1840da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1841da112707SJunchao Zhang PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1842da112707SJunchao Zhang 1843da112707SJunchao Zhang /* ====================================================================== */ 1844da112707SJunchao Zhang /* Create mat descriptors for M, L */ 1845da112707SJunchao Zhang /* ====================================================================== */ 1846da112707SJunchao Zhang cusparseFillMode_t fillMode; 1847da112707SJunchao Zhang cusparseDiagType_t diagType; 1848da112707SJunchao Zhang 1849da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1850da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1851da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1852da112707SJunchao Zhang 1853da112707SJunchao Zhang /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1854da112707SJunchao Zhang cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1855da112707SJunchao Zhang assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1856da112707SJunchao Zhang all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1857da112707SJunchao Zhang assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 1858da112707SJunchao Zhang */ 1859da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 1860da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 18619371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 18629371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 18639371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1864da112707SJunchao Zhang 1865da112707SJunchao Zhang /* ========================================================================= */ 1866da112707SJunchao Zhang /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 1867da112707SJunchao Zhang /* ========================================================================= */ 1868da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 18699371c9d4SSatish Balay if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M)); 1870da112707SJunchao Zhang 1871da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1872da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1873da112707SJunchao Zhang 1874da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1875da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 1876da112707SJunchao Zhang 1877da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 18789371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1879da112707SJunchao Zhang 1880da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 18819371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1882da112707SJunchao Zhang 188312ba2bc6SJunchao Zhang /* To save device memory, we make the factorization buffer share with one of the solver buffer. 188412ba2bc6SJunchao Zhang See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 188512ba2bc6SJunchao Zhang */ 188612ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 188712ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 188812ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 1889da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 189012ba2bc6SJunchao Zhang } else { 189112ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 189212ba2bc6SJunchao Zhang fs->spsvBuffer_Lt = fs->factBuffer_M; 189312ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 189412ba2bc6SJunchao Zhang } 1895da112707SJunchao Zhang 1896da112707SJunchao Zhang /* ========================================================================== */ 1897da112707SJunchao Zhang /* Perform analysis of ic0 on M */ 1898da112707SJunchao Zhang /* The lower triangular part of M has the same sparsity pattern as L */ 1899da112707SJunchao Zhang /* ========================================================================== */ 1900da112707SJunchao Zhang int structural_zero; 1901da112707SJunchao Zhang cusparseStatus_t status; 1902da112707SJunchao Zhang 1903da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 19049371c9d4SSatish Balay if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 1905da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 1906da112707SJunchao Zhang /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 1907da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 1908da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 1909da112707SJunchao Zhang } 1910da112707SJunchao Zhang 1911da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 19120dd8c0acSJunchao Zhang { 1913da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 19140dd8c0acSJunchao Zhang PetscInt *Ai, nzRow, nzLeft; 1915da112707SJunchao Zhang PetscLogDouble flops = 0.0; 1916da112707SJunchao Zhang 1917da112707SJunchao Zhang Ai = Aseq->i; 1918da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 1919da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 1920da112707SJunchao Zhang if (nzRow > 1) { 1921da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1922da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 1923da112707SJunchao Zhang */ 1924da112707SJunchao Zhang nzLeft = (nzRow - 1) / 2; 1925da112707SJunchao Zhang flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1926da112707SJunchao Zhang } 1927da112707SJunchao Zhang } 1928da112707SJunchao Zhang fs->numericFactFlops = flops; 19290dd8c0acSJunchao Zhang } 1930da112707SJunchao Zhang fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 1931da112707SJunchao Zhang PetscFunctionReturn(0); 1932da112707SJunchao Zhang } 1933da112707SJunchao Zhang #endif 1934da112707SJunchao Zhang 19359371c9d4SSatish Balay static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 1936da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1937da112707SJunchao Zhang 1938da112707SJunchao Zhang PetscFunctionBegin; 1939da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 1940bc996fdcSJunchao Zhang PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 1941bc996fdcSJunchao Zhang if (cusparseTriFactors->factorizeOnDevice) { 1942da112707SJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 1943da112707SJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 1944bc996fdcSJunchao Zhang } 1945da112707SJunchao Zhang if (!info->levels && row_identity && col_identity) { 1946da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 1947da112707SJunchao Zhang } else 1948da112707SJunchao Zhang #endif 1949da112707SJunchao Zhang { 1950da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1951da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1952da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 1953da112707SJunchao Zhang } 1954da112707SJunchao Zhang PetscFunctionReturn(0); 1955da112707SJunchao Zhang } 1956da112707SJunchao Zhang 19579371c9d4SSatish Balay static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 1958da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1959da112707SJunchao Zhang 1960da112707SJunchao Zhang PetscFunctionBegin; 1961da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1962da112707SJunchao Zhang PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 1963da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 1964da112707SJunchao Zhang PetscFunctionReturn(0); 1965da112707SJunchao Zhang } 1966da112707SJunchao Zhang 19679371c9d4SSatish Balay static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) { 1968da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1969da112707SJunchao Zhang 1970da112707SJunchao Zhang PetscFunctionBegin; 1971da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 1972bc996fdcSJunchao Zhang PetscBool perm_identity = PETSC_FALSE; 1973bc996fdcSJunchao Zhang if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity)); 1974da112707SJunchao Zhang if (!info->levels && perm_identity) { 1975da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 1976da112707SJunchao Zhang } else 1977da112707SJunchao Zhang #endif 1978da112707SJunchao Zhang { 1979da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1980da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 1981da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 1982da112707SJunchao Zhang } 1983da112707SJunchao Zhang PetscFunctionReturn(0); 1984da112707SJunchao Zhang } 1985da112707SJunchao Zhang 19869371c9d4SSatish Balay static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) { 1987da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 1988da112707SJunchao Zhang 1989da112707SJunchao Zhang PetscFunctionBegin; 1990da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 1991da112707SJunchao Zhang PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 1992da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 1993da112707SJunchao Zhang PetscFunctionReturn(0); 1994da112707SJunchao Zhang } 1995da112707SJunchao Zhang 19969371c9d4SSatish Balay PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) { 1997841d4cb1SJunchao Zhang PetscFunctionBegin; 1998841d4cb1SJunchao Zhang *type = MATSOLVERCUSPARSE; 1999841d4cb1SJunchao Zhang PetscFunctionReturn(0); 2000841d4cb1SJunchao Zhang } 2001841d4cb1SJunchao Zhang 2002841d4cb1SJunchao Zhang /*MC 2003841d4cb1SJunchao Zhang MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 2004841d4cb1SJunchao Zhang on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 2005841d4cb1SJunchao Zhang algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2006841d4cb1SJunchao Zhang performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 2007841d4cb1SJunchao Zhang CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2008841d4cb1SJunchao Zhang algorithms are not recommended. This class does NOT support direct solver operations. 2009841d4cb1SJunchao Zhang 2010841d4cb1SJunchao Zhang Level: beginner 2011841d4cb1SJunchao Zhang 2012841d4cb1SJunchao Zhang .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2013841d4cb1SJunchao Zhang M*/ 2014841d4cb1SJunchao Zhang 20159371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) { 2016841d4cb1SJunchao Zhang PetscInt n = A->rmap->n; 2017bc996fdcSJunchao Zhang PetscBool factOnDevice, factOnHost; 2018bc996fdcSJunchao Zhang char *prefix; 2019bc996fdcSJunchao Zhang char factPlace[32] = "device"; /* the default */ 2020841d4cb1SJunchao Zhang 2021841d4cb1SJunchao Zhang PetscFunctionBegin; 2022841d4cb1SJunchao Zhang PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2023841d4cb1SJunchao Zhang PetscCall(MatSetSizes(*B, n, n, n, n)); 2024841d4cb1SJunchao Zhang (*B)->factortype = ftype; 2025841d4cb1SJunchao Zhang PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); 2026841d4cb1SJunchao Zhang 2027bc996fdcSJunchao Zhang prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix; 2028bc996fdcSJunchao Zhang PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat"); 2029bc996fdcSJunchao Zhang PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL)); 2030bc996fdcSJunchao Zhang PetscOptionsEnd(); 2031bc996fdcSJunchao Zhang PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice)); 2032bc996fdcSJunchao Zhang PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost)); 2033bc996fdcSJunchao Zhang PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace); 2034bc996fdcSJunchao Zhang ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice; 2035bc996fdcSJunchao Zhang 2036841d4cb1SJunchao Zhang if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2037841d4cb1SJunchao Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2038841d4cb1SJunchao Zhang PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2039841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2040841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2041841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2042841d4cb1SJunchao Zhang } else { 2043841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2044841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2045841d4cb1SJunchao Zhang } 2046841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2047841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2048841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2049841d4cb1SJunchao Zhang } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2050841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2051841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2052841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2053841d4cb1SJunchao Zhang } else { 2054841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2055841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2056841d4cb1SJunchao Zhang } 2057841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2058841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2059841d4cb1SJunchao Zhang } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types"); 2060841d4cb1SJunchao Zhang 2061841d4cb1SJunchao Zhang PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2062841d4cb1SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 2063841d4cb1SJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse)); 2064841d4cb1SJunchao Zhang PetscFunctionReturn(0); 2065841d4cb1SJunchao Zhang } 2066841d4cb1SJunchao Zhang 20679371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) { 20687e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 20697e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 20700dd8c0acSJunchao Zhang #if CUSPARSE_VERSION >= 13500 2071da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 20720dd8c0acSJunchao Zhang #endif 20737e8381f9SStefano Zampini 20747e8381f9SStefano Zampini PetscFunctionBegin; 20757e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 20769566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 2077da112707SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 2078da112707SJunchao Zhang CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat; 20799566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2080da112707SJunchao Zhang } 2081da112707SJunchao Zhang #if CUSPARSE_VERSION >= 13500 2082da112707SJunchao Zhang else if (fs->csrVal) { 2083da112707SJunchao Zhang /* We have a factorized matrix on device and are able to copy it to host */ 2084da112707SJunchao Zhang PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2085da112707SJunchao Zhang } 2086da112707SJunchao Zhang #endif 20879371c9d4SSatish Balay else 20889371c9d4SSatish Balay SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host"); 20899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar))); 20909566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 20917e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 20927e8381f9SStefano Zampini } 20937e8381f9SStefano Zampini PetscFunctionReturn(0); 20947e8381f9SStefano Zampini } 20957e8381f9SStefano Zampini 20969371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) { 20977e8381f9SStefano Zampini PetscFunctionBegin; 20989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 209967a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 210067a45760SJunchao Zhang PetscFunctionReturn(0); 210167a45760SJunchao Zhang } 210267a45760SJunchao Zhang 21039371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) { 210467a45760SJunchao Zhang PetscFunctionBegin; 21057e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 210667a45760SJunchao Zhang *array = NULL; 210767a45760SJunchao Zhang PetscFunctionReturn(0); 210867a45760SJunchao Zhang } 210967a45760SJunchao Zhang 21109371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) { 211167a45760SJunchao Zhang PetscFunctionBegin; 21129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 211367a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 211467a45760SJunchao Zhang PetscFunctionReturn(0); 211567a45760SJunchao Zhang } 211667a45760SJunchao Zhang 21179371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) { 211867a45760SJunchao Zhang PetscFunctionBegin; 211967a45760SJunchao Zhang *array = NULL; 212067a45760SJunchao Zhang PetscFunctionReturn(0); 212167a45760SJunchao Zhang } 212267a45760SJunchao Zhang 21239371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) { 212467a45760SJunchao Zhang PetscFunctionBegin; 212567a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 212667a45760SJunchao Zhang PetscFunctionReturn(0); 212767a45760SJunchao Zhang } 212867a45760SJunchao Zhang 21299371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) { 213067a45760SJunchao Zhang PetscFunctionBegin; 213167a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 213267a45760SJunchao Zhang *array = NULL; 21337e8381f9SStefano Zampini PetscFunctionReturn(0); 21347e8381f9SStefano Zampini } 21357e8381f9SStefano Zampini 21369371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) { 21377ee59b9bSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp; 21387ee59b9bSJunchao Zhang CsrMatrix *matrix; 21397ee59b9bSJunchao Zhang 21407ee59b9bSJunchao Zhang PetscFunctionBegin; 21417ee59b9bSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 21427ee59b9bSJunchao Zhang PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 21437ee59b9bSJunchao Zhang cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 21447ee59b9bSJunchao Zhang PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL"); 21457ee59b9bSJunchao Zhang matrix = (CsrMatrix *)cusp->mat->mat; 21467ee59b9bSJunchao Zhang 21477ee59b9bSJunchao Zhang if (i) { 21487ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 21497ee59b9bSJunchao Zhang *i = matrix->row_offsets->data().get(); 21507ee59b9bSJunchao Zhang #else 21517ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 21527ee59b9bSJunchao Zhang #endif 21537ee59b9bSJunchao Zhang } 21547ee59b9bSJunchao Zhang if (j) { 21557ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 21567ee59b9bSJunchao Zhang *j = matrix->column_indices->data().get(); 21577ee59b9bSJunchao Zhang #else 21587ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 21597ee59b9bSJunchao Zhang #endif 21607ee59b9bSJunchao Zhang } 21617ee59b9bSJunchao Zhang if (a) *a = matrix->values->data().get(); 21627ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 21637ee59b9bSJunchao Zhang PetscFunctionReturn(0); 21647ee59b9bSJunchao Zhang } 21657ee59b9bSJunchao Zhang 21669371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) { 2167aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 21687c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 21699ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2170213423ffSJunchao Zhang PetscInt m = A->rmap->n, *ii, *ridx, tmp; 2171aa372e3fSPaul Mullowney cusparseStatus_t stat; 2172abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 21739ae82921SPaul Mullowney 21749ae82921SPaul Mullowney PetscFunctionBegin; 217528b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU"); 2176c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2177a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2178a49f1ed0SStefano Zampini CsrMatrix *matrix; 2179afb2bd1cSJunchao Zhang matrix = (CsrMatrix *)cusparsestruct->mat->mat; 218085ba7357SStefano Zampini 218108401ef6SPierre Jolivet PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 21829566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2183afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a + a->nz); 21849566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 21859566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar))); 21869566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 21879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 218834d6c7a5SJose E. Roman } else { 2189abb89eb1SStefano Zampini PetscInt nnz; 21909566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 21919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 21929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 21937c700b8dSJunchao Zhang delete cusparsestruct->workVector; 219481902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 2195a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 2196a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 21979ae82921SPaul Mullowney try { 21989ae82921SPaul Mullowney if (a->compressedrow.use) { 21999ae82921SPaul Mullowney m = a->compressedrow.nrows; 22009ae82921SPaul Mullowney ii = a->compressedrow.i; 22019ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 22029ae82921SPaul Mullowney } else { 2203213423ffSJunchao Zhang m = A->rmap->n; 2204213423ffSJunchao Zhang ii = a->i; 2205e6e9a74fSStefano Zampini ridx = NULL; 22069ae82921SPaul Mullowney } 220708401ef6SPierre Jolivet PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 22089371c9d4SSatish Balay if (!a->a) { 22099371c9d4SSatish Balay nnz = ii[m]; 22109371c9d4SSatish Balay both = PETSC_FALSE; 22119371c9d4SSatish Balay } else nnz = a->nz; 221208401ef6SPierre Jolivet PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 22139ae82921SPaul Mullowney 221485ba7357SStefano Zampini /* create cusparse matrix */ 2215abb89eb1SStefano Zampini cusparsestruct->nrows = m; 2216aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 22179566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 22189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 22199566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 22209ae82921SPaul Mullowney 22219566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar))); 22229566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar))); 22239566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar))); 22249566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 22259566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 22269566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 22279566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2228b06137fdSPaul Mullowney 2229aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2230aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2231aa372e3fSPaul Mullowney /* set the matrix */ 2232afb2bd1cSJunchao Zhang CsrMatrix *mat = new CsrMatrix; 2233afb2bd1cSJunchao Zhang mat->num_rows = m; 2234afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2235abb89eb1SStefano Zampini mat->num_entries = nnz; 2236afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2237afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m + 1); 22389ae82921SPaul Mullowney 2239abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 2240abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j + nnz); 2241aa372e3fSPaul Mullowney 2242abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 2243abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a + nnz); 2244aa372e3fSPaul Mullowney 2245aa372e3fSPaul Mullowney /* assign the pointer */ 2246afb2bd1cSJunchao Zhang matstruct->mat = mat; 2247afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2248afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! */ 22499371c9d4SSatish Balay stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 22509371c9d4SSatish Balay CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 22519371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2252afb2bd1cSJunchao Zhang } 2253afb2bd1cSJunchao Zhang #endif 2254aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2255afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2256afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2257afb2bd1cSJunchao Zhang #else 2258afb2bd1cSJunchao Zhang CsrMatrix *mat = new CsrMatrix; 2259afb2bd1cSJunchao Zhang mat->num_rows = m; 2260afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 2261abb89eb1SStefano Zampini mat->num_entries = nnz; 2262afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m + 1); 2263afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m + 1); 2264aa372e3fSPaul Mullowney 2265abb89eb1SStefano Zampini mat->column_indices = new THRUSTINTARRAY32(nnz); 2266abb89eb1SStefano Zampini mat->column_indices->assign(a->j, a->j + nnz); 2267aa372e3fSPaul Mullowney 2268abb89eb1SStefano Zampini mat->values = new THRUSTARRAY(nnz); 2269abb89eb1SStefano Zampini if (a->a) mat->values->assign(a->a, a->a + nnz); 2270aa372e3fSPaul Mullowney 2271aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 22729566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 22739371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 22749371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 22759371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2276aa372e3fSPaul Mullowney /* assign the pointer */ 2277aa372e3fSPaul Mullowney matstruct->mat = hybMat; 2278aa372e3fSPaul Mullowney 2279afb2bd1cSJunchao Zhang if (mat) { 2280afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY *)mat->values; 2281afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2282afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2283afb2bd1cSJunchao Zhang delete (CsrMatrix *)mat; 2284087f3262SPaul Mullowney } 2285afb2bd1cSJunchao Zhang #endif 2286087f3262SPaul Mullowney } 2287ca45077fSPaul Mullowney 2288aa372e3fSPaul Mullowney /* assign the compressed row indices */ 2289213423ffSJunchao Zhang if (a->compressedrow.use) { 2290213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 2291aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 2292aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx, ridx + m); 2293213423ffSJunchao Zhang tmp = m; 2294213423ffSJunchao Zhang } else { 2295213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 2296213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 2297213423ffSJunchao Zhang tmp = 0; 2298213423ffSJunchao Zhang } 22999566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2300aa372e3fSPaul Mullowney 2301aa372e3fSPaul Mullowney /* assign the pointer */ 2302aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 23039371c9d4SSatish Balay } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } 23049566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 23059566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 230634d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 230734d6c7a5SJose E. Roman } 2308abb89eb1SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 23099ae82921SPaul Mullowney } 23109ae82921SPaul Mullowney PetscFunctionReturn(0); 23119ae82921SPaul Mullowney } 23129ae82921SPaul Mullowney 23139371c9d4SSatish Balay struct VecCUDAPlusEquals { 2314aa372e3fSPaul Mullowney template <typename Tuple> 23159371c9d4SSatish Balay __host__ __device__ void operator()(Tuple t) { 2316aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2317aa372e3fSPaul Mullowney } 2318aa372e3fSPaul Mullowney }; 2319aa372e3fSPaul Mullowney 23209371c9d4SSatish Balay struct VecCUDAEquals { 23217e8381f9SStefano Zampini template <typename Tuple> 23229371c9d4SSatish Balay __host__ __device__ void operator()(Tuple t) { 23237e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 23247e8381f9SStefano Zampini } 23257e8381f9SStefano Zampini }; 23267e8381f9SStefano Zampini 23279371c9d4SSatish Balay struct VecCUDAEqualsReverse { 2328e6e9a74fSStefano Zampini template <typename Tuple> 23299371c9d4SSatish Balay __host__ __device__ void operator()(Tuple t) { 2330e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 2331e6e9a74fSStefano Zampini } 2332e6e9a74fSStefano Zampini }; 2333e6e9a74fSStefano Zampini 2334afb2bd1cSJunchao Zhang struct MatMatCusparse { 2335ccdfe979SStefano Zampini PetscBool cisdense; 2336ccdfe979SStefano Zampini PetscScalar *Bt; 2337ccdfe979SStefano Zampini Mat X; 2338fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2339fcdce8c4SStefano Zampini PetscLogDouble flops; 2340fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 2341b4285af6SJunchao Zhang 2342afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2343fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 2344afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2345afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 2346afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 2347afb2bd1cSJunchao Zhang PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2348b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2349b4285af6SJunchao Zhang void *dBuffer4; 2350b4285af6SJunchao Zhang void *dBuffer5; 2351b4285af6SJunchao Zhang #endif 2352fcdce8c4SStefano Zampini size_t mmBufferSize; 2353fcdce8c4SStefano Zampini void *mmBuffer; 2354fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2355fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 2356afb2bd1cSJunchao Zhang #endif 2357afb2bd1cSJunchao Zhang }; 2358ccdfe979SStefano Zampini 23599371c9d4SSatish Balay static PetscErrorCode MatDestroy_MatMatCusparse(void *data) { 2360ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 2361ccdfe979SStefano Zampini 2362ccdfe979SStefano Zampini PetscFunctionBegin; 23639566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->Bt)); 2364fcdce8c4SStefano Zampini delete mmdata->Bcsr; 2365afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 23669566063dSJacob Faibussowitsch if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 23679566063dSJacob Faibussowitsch if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 23689566063dSJacob Faibussowitsch if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 23699566063dSJacob Faibussowitsch if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2370b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 23719566063dSJacob Faibussowitsch if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 23729566063dSJacob Faibussowitsch if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2373b4285af6SJunchao Zhang #endif 23749566063dSJacob Faibussowitsch if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 23759566063dSJacob Faibussowitsch if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2376afb2bd1cSJunchao Zhang #endif 23779566063dSJacob Faibussowitsch PetscCall(MatDestroy(&mmdata->X)); 23789566063dSJacob Faibussowitsch PetscCall(PetscFree(data)); 2379ccdfe979SStefano Zampini PetscFunctionReturn(0); 2380ccdfe979SStefano Zampini } 2381ccdfe979SStefano Zampini 2382ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool); 2383ccdfe979SStefano Zampini 23849371c9d4SSatish Balay static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) { 2385ccdfe979SStefano Zampini Mat_Product *product = C->product; 2386ccdfe979SStefano Zampini Mat A, B; 2387afb2bd1cSJunchao Zhang PetscInt m, n, blda, clda; 2388ccdfe979SStefano Zampini PetscBool flg, biscuda; 2389ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2390ccdfe979SStefano Zampini cusparseStatus_t stat; 2391ccdfe979SStefano Zampini cusparseOperation_t opA; 2392ccdfe979SStefano Zampini const PetscScalar *barray; 2393ccdfe979SStefano Zampini PetscScalar *carray; 2394ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2395ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 2396ccdfe979SStefano Zampini CsrMatrix *csrmat; 2397ccdfe979SStefano Zampini 2398ccdfe979SStefano Zampini PetscFunctionBegin; 2399ccdfe979SStefano Zampini MatCheckProduct(C, 1); 240028b400f6SJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2401ccdfe979SStefano Zampini mmdata = (MatMatCusparse *)product->data; 2402ccdfe979SStefano Zampini A = product->A; 2403ccdfe979SStefano Zampini B = product->B; 24049566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 240528b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2406ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 2407ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 240828b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 24099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2410ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2411ccdfe979SStefano Zampini switch (product->type) { 2412ccdfe979SStefano Zampini case MATPRODUCT_AB: 2413ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2414ccdfe979SStefano Zampini mat = cusp->mat; 2415ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2416ccdfe979SStefano Zampini m = A->rmap->n; 2417ccdfe979SStefano Zampini n = B->cmap->n; 2418ccdfe979SStefano Zampini break; 2419ccdfe979SStefano Zampini case MATPRODUCT_AtB: 24201a2c6b5cSJunchao Zhang if (!A->form_explicit_transpose) { 2421e6e9a74fSStefano Zampini mat = cusp->mat; 2422e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 2423e6e9a74fSStefano Zampini } else { 24249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2425ccdfe979SStefano Zampini mat = cusp->matTranspose; 2426ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2427e6e9a74fSStefano Zampini } 2428ccdfe979SStefano Zampini m = A->cmap->n; 2429ccdfe979SStefano Zampini n = B->cmap->n; 2430ccdfe979SStefano Zampini break; 2431ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2432ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2433ccdfe979SStefano Zampini mat = cusp->mat; 2434ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2435ccdfe979SStefano Zampini m = A->rmap->n; 2436ccdfe979SStefano Zampini n = B->rmap->n; 2437ccdfe979SStefano Zampini break; 24389371c9d4SSatish Balay default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2439ccdfe979SStefano Zampini } 244028b400f6SJacob Faibussowitsch PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2441ccdfe979SStefano Zampini csrmat = (CsrMatrix *)mat->mat; 2442ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 24439566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 24449566063dSJacob Faibussowitsch if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 24459566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayRead(B, &barray)); 2446afb2bd1cSJunchao Zhang 24479566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(B, &blda)); 2448c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 24499566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray)); 24509566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2451c8378d12SStefano Zampini } else { 24529566063dSJacob Faibussowitsch PetscCall(MatDenseCUDAGetArrayWrite(C, &carray)); 24539566063dSJacob Faibussowitsch PetscCall(MatDenseGetLDA(C, &clda)); 2454c8378d12SStefano Zampini } 2455c8378d12SStefano Zampini 24569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2457afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2458afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2459a5b23f4aSJose E. Roman /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2460afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2461fcdce8c4SStefano Zampini size_t mmBufferSize; 24629371c9d4SSatish Balay if (mmdata->initialized && mmdata->Blda != blda) { 24639371c9d4SSatish Balay PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 24649371c9d4SSatish Balay mmdata->matBDescr = NULL; 24659371c9d4SSatish Balay } 2466afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 24679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2468afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2469afb2bd1cSJunchao Zhang } 2470c8378d12SStefano Zampini 24719371c9d4SSatish Balay if (mmdata->initialized && mmdata->Clda != clda) { 24729371c9d4SSatish Balay PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 24739371c9d4SSatish Balay mmdata->matCDescr = NULL; 24749371c9d4SSatish Balay } 2475afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 24769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2477afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2478afb2bd1cSJunchao Zhang } 2479afb2bd1cSJunchao Zhang 2480afb2bd1cSJunchao Zhang if (!mat->matDescr) { 24819371c9d4SSatish Balay stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 24829371c9d4SSatish Balay CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 24839371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2484afb2bd1cSJunchao Zhang } 24859371c9d4SSatish Balay stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize); 24869371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2487fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 24889566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 24899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2490fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2491fcdce8c4SStefano Zampini } 2492afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2493afb2bd1cSJunchao Zhang } else { 2494afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 24959566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get())); 24969566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 24979566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2498afb2bd1cSJunchao Zhang } 2499afb2bd1cSJunchao Zhang 2500afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 25019371c9d4SSatish Balay stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer); 25029371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2503afb2bd1cSJunchao Zhang #else 2504afb2bd1cSJunchao Zhang PetscInt k; 2505afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2506ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2507ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2508ccdfe979SStefano Zampini cublasStatus_t cerr; 2509ccdfe979SStefano Zampini 25109566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 25119371c9d4SSatish Balay cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 25129371c9d4SSatish Balay PetscCallCUBLAS(cerr); 2513ccdfe979SStefano Zampini blda = B->cmap->n; 2514afb2bd1cSJunchao Zhang k = B->cmap->n; 2515afb2bd1cSJunchao Zhang } else { 2516afb2bd1cSJunchao Zhang k = B->rmap->n; 2517ccdfe979SStefano Zampini } 2518ccdfe979SStefano Zampini 2519afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 25209371c9d4SSatish Balay stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 25219371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2522afb2bd1cSJunchao Zhang #endif 25239566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 25249566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 25259566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayRead(B, &barray)); 2526ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 25279566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray)); 25289566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2529ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 25309566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray)); 25319566063dSJacob Faibussowitsch PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2532ccdfe979SStefano Zampini } else { 25339566063dSJacob Faibussowitsch PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray)); 2534ccdfe979SStefano Zampini } 2535*48a46eb9SPierre Jolivet if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 2536*48a46eb9SPierre Jolivet if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 2537ccdfe979SStefano Zampini PetscFunctionReturn(0); 2538ccdfe979SStefano Zampini } 2539ccdfe979SStefano Zampini 25409371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) { 2541ccdfe979SStefano Zampini Mat_Product *product = C->product; 2542ccdfe979SStefano Zampini Mat A, B; 2543ccdfe979SStefano Zampini PetscInt m, n; 2544ccdfe979SStefano Zampini PetscBool cisdense, flg; 2545ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2546ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2547ccdfe979SStefano Zampini 2548ccdfe979SStefano Zampini PetscFunctionBegin; 2549ccdfe979SStefano Zampini MatCheckProduct(C, 1); 255028b400f6SJacob Faibussowitsch PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2551ccdfe979SStefano Zampini A = product->A; 2552ccdfe979SStefano Zampini B = product->B; 25539566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 255428b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2555ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 255608401ef6SPierre Jolivet PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2557ccdfe979SStefano Zampini switch (product->type) { 2558ccdfe979SStefano Zampini case MATPRODUCT_AB: 2559ccdfe979SStefano Zampini m = A->rmap->n; 2560ccdfe979SStefano Zampini n = B->cmap->n; 2561ccdfe979SStefano Zampini break; 2562ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2563ccdfe979SStefano Zampini m = A->cmap->n; 2564ccdfe979SStefano Zampini n = B->cmap->n; 2565ccdfe979SStefano Zampini break; 2566ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2567ccdfe979SStefano Zampini m = A->rmap->n; 2568ccdfe979SStefano Zampini n = B->rmap->n; 2569ccdfe979SStefano Zampini break; 2570ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2571ccdfe979SStefano Zampini m = B->cmap->n; 2572ccdfe979SStefano Zampini n = B->cmap->n; 2573ccdfe979SStefano Zampini break; 2574ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2575ccdfe979SStefano Zampini m = B->rmap->n; 2576ccdfe979SStefano Zampini n = B->rmap->n; 2577ccdfe979SStefano Zampini break; 25789371c9d4SSatish Balay default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2579ccdfe979SStefano Zampini } 25809566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, m, n, m, n)); 2581ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 25829566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 25839566063dSJacob Faibussowitsch PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2584ccdfe979SStefano Zampini 2585ccdfe979SStefano Zampini /* product data */ 25869566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2587ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2588afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2589afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2590*48a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2591afb2bd1cSJunchao Zhang #endif 2592ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2593ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 25949566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 25959566063dSJacob Faibussowitsch PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2596ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 25979566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2598ccdfe979SStefano Zampini } else { 25999566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2600ccdfe979SStefano Zampini } 2601ccdfe979SStefano Zampini } 2602ccdfe979SStefano Zampini C->product->data = mmdata; 2603ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2604ccdfe979SStefano Zampini 2605ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2606ccdfe979SStefano Zampini PetscFunctionReturn(0); 2607ccdfe979SStefano Zampini } 2608ccdfe979SStefano Zampini 26099371c9d4SSatish Balay static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) { 2610ccdfe979SStefano Zampini Mat_Product *product = C->product; 2611fcdce8c4SStefano Zampini Mat A, B; 2612fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2613fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2614fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2615fcdce8c4SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 2616fcdce8c4SStefano Zampini PetscBool flg; 2617fcdce8c4SStefano Zampini cusparseStatus_t stat; 2618fcdce8c4SStefano Zampini MatProductType ptype; 2619fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2620fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2621fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2622fcdce8c4SStefano Zampini #endif 2623b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2624ccdfe979SStefano Zampini 2625ccdfe979SStefano Zampini PetscFunctionBegin; 2626ccdfe979SStefano Zampini MatCheckProduct(C, 1); 262728b400f6SJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 26289566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg)); 262928b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2630fcdce8c4SStefano Zampini mmdata = (MatMatCusparse *)C->product->data; 2631fcdce8c4SStefano Zampini A = product->A; 2632fcdce8c4SStefano Zampini B = product->B; 2633fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2634fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2635fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 263608401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2637fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 263828b400f6SJacob Faibussowitsch PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]); 2639fcdce8c4SStefano Zampini Ccsr = (CsrMatrix *)Cmat->mat; 264028b400f6SJacob Faibussowitsch PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2641fcdce8c4SStefano Zampini goto finalize; 2642fcdce8c4SStefano Zampini } 2643fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 26449566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 264528b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 26469566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 264728b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 264828b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 264928b400f6SJacob Faibussowitsch PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2650fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2651fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2652fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 265308401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 265408401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 265508401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 26569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 26579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2658fcdce8c4SStefano Zampini 2659fcdce8c4SStefano Zampini ptype = product->type; 2660b94d7dedSBarry Smith if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2661fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 266228b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric"); 2663fa046f9fSJunchao Zhang } 2664b94d7dedSBarry Smith if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2665fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 266628b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric"); 2667fa046f9fSJunchao Zhang } 2668fcdce8c4SStefano Zampini switch (ptype) { 2669fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2670fcdce8c4SStefano Zampini Amat = Acusp->mat; 2671fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2672fcdce8c4SStefano Zampini break; 2673fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2674fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2675fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2676fcdce8c4SStefano Zampini break; 2677fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2678fcdce8c4SStefano Zampini Amat = Acusp->mat; 2679fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2680fcdce8c4SStefano Zampini break; 26819371c9d4SSatish Balay default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2682fcdce8c4SStefano Zampini } 2683fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 268428b400f6SJacob Faibussowitsch PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 268528b400f6SJacob Faibussowitsch PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 268628b400f6SJacob Faibussowitsch PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]); 2687fcdce8c4SStefano Zampini Acsr = (CsrMatrix *)Amat->mat; 2688fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */ 2689fcdce8c4SStefano Zampini Ccsr = (CsrMatrix *)Cmat->mat; 269028b400f6SJacob Faibussowitsch PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 269128b400f6SJacob Faibussowitsch PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 269228b400f6SJacob Faibussowitsch PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 26939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2694fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2695fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 26969566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2697b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 26989371c9d4SSatish Balay stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 26999371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2700b4285af6SJunchao Zhang #else 27019371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 27029371c9d4SSatish Balay PetscCallCUSPARSE(stat); 27039371c9d4SSatish Balay stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 27049371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2705b4285af6SJunchao Zhang #endif 2706fcdce8c4SStefano Zampini #else 27079371c9d4SSatish Balay stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 27089371c9d4SSatish Balay Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 27099371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2710fcdce8c4SStefano Zampini #endif 27119566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 27129566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 27139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2714fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2715fcdce8c4SStefano Zampini finalize: 2716fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 27179566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 27189566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 27199566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 2720fcdce8c4SStefano Zampini c->reallocs = 0; 2721fcdce8c4SStefano Zampini C->info.mallocs += 0; 2722fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2723fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2724fcdce8c4SStefano Zampini C->num_ass++; 2725ccdfe979SStefano Zampini PetscFunctionReturn(0); 2726ccdfe979SStefano Zampini } 2727fcdce8c4SStefano Zampini 27289371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) { 2729fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2730fcdce8c4SStefano Zampini Mat A, B; 2731fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2732fcdce8c4SStefano Zampini Mat_SeqAIJ *a, *b, *c; 2733fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2734fcdce8c4SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 2735fcdce8c4SStefano Zampini PetscInt i, j, m, n, k; 2736fcdce8c4SStefano Zampini PetscBool flg; 2737fcdce8c4SStefano Zampini cusparseStatus_t stat; 2738fcdce8c4SStefano Zampini MatProductType ptype; 2739fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2740fcdce8c4SStefano Zampini PetscLogDouble flops; 2741fcdce8c4SStefano Zampini PetscBool biscompressed, ciscompressed; 2742fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2743fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2744fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2745fcdce8c4SStefano Zampini #else 2746fcdce8c4SStefano Zampini int cnz; 2747fcdce8c4SStefano Zampini #endif 2748b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2749fcdce8c4SStefano Zampini 2750fcdce8c4SStefano Zampini PetscFunctionBegin; 2751fcdce8c4SStefano Zampini MatCheckProduct(C, 1); 275228b400f6SJacob Faibussowitsch PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2753fcdce8c4SStefano Zampini A = product->A; 2754fcdce8c4SStefano Zampini B = product->B; 27559566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 275628b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 27579566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 275828b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 2759fcdce8c4SStefano Zampini a = (Mat_SeqAIJ *)A->data; 2760fcdce8c4SStefano Zampini b = (Mat_SeqAIJ *)B->data; 2761fcdce8c4SStefano Zampini /* product data */ 27629566063dSJacob Faibussowitsch PetscCall(PetscNew(&mmdata)); 2763fcdce8c4SStefano Zampini C->product->data = mmdata; 2764fcdce8c4SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2765fcdce8c4SStefano Zampini 27669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 27679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2768d60bce21SJunchao Zhang Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 2769d60bce21SJunchao Zhang Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 277008401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 277108401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2772d60bce21SJunchao Zhang 2773fcdce8c4SStefano Zampini ptype = product->type; 2774b94d7dedSBarry Smith if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2775fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2776fa046f9fSJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 2777fa046f9fSJunchao Zhang } 2778b94d7dedSBarry Smith if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2779fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 2780fa046f9fSJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 2781fa046f9fSJunchao Zhang } 2782fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2783fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2784fcdce8c4SStefano Zampini switch (ptype) { 2785fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2786fcdce8c4SStefano Zampini m = A->rmap->n; 2787fcdce8c4SStefano Zampini n = B->cmap->n; 2788fcdce8c4SStefano Zampini k = A->cmap->n; 2789fcdce8c4SStefano Zampini Amat = Acusp->mat; 2790fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2791fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2792fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2793fcdce8c4SStefano Zampini break; 2794fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2795fcdce8c4SStefano Zampini m = A->cmap->n; 2796fcdce8c4SStefano Zampini n = B->cmap->n; 2797fcdce8c4SStefano Zampini k = A->rmap->n; 27989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2799fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2800fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2801fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2802fcdce8c4SStefano Zampini break; 2803fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2804fcdce8c4SStefano Zampini m = A->rmap->n; 2805fcdce8c4SStefano Zampini n = B->rmap->n; 2806fcdce8c4SStefano Zampini k = A->cmap->n; 28079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 2808fcdce8c4SStefano Zampini Amat = Acusp->mat; 2809fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2810fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2811fcdce8c4SStefano Zampini break; 28129371c9d4SSatish Balay default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2813fcdce8c4SStefano Zampini } 2814fcdce8c4SStefano Zampini 2815fcdce8c4SStefano Zampini /* create cusparse matrix */ 28169566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C, m, n, m, n)); 28179566063dSJacob Faibussowitsch PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 2818fcdce8c4SStefano Zampini c = (Mat_SeqAIJ *)C->data; 2819fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 2820fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2821fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2822fcdce8c4SStefano Zampini 2823fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2824fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2825fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 28269566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 28279566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 2828fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2829fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2830fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 2831fcdce8c4SStefano Zampini } else { 2832fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2833fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2834fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2835fcdce8c4SStefano Zampini Ccusp->workVector = NULL; 2836fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2837fcdce8c4SStefano Zampini } 2838fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2839fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2840fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2841fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2842fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2843fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 28449566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 28459566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 28469566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 28479566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 28489566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 28499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 28509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 28519566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 28529566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2853fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */ 2854fcdce8c4SStefano Zampini thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0); 2855fcdce8c4SStefano Zampini c->nz = 0; 2856fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2857fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2858fcdce8c4SStefano Zampini goto finalizesym; 2859fcdce8c4SStefano Zampini } 2860fcdce8c4SStefano Zampini 286128b400f6SJacob Faibussowitsch PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 286228b400f6SJacob Faibussowitsch PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 2863fcdce8c4SStefano Zampini Acsr = (CsrMatrix *)Amat->mat; 2864fcdce8c4SStefano Zampini if (!biscompressed) { 2865fcdce8c4SStefano Zampini Bcsr = (CsrMatrix *)Bmat->mat; 2866fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2867fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2868fcdce8c4SStefano Zampini #endif 2869fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2870fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 2871fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2872fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2873fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2874fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2875fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2876fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2877fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2878fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2879fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 28809566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 2881fcdce8c4SStefano Zampini } 2882fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2883fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2884fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2885fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 28869371c9d4SSatish Balay stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 28879371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2888fcdce8c4SStefano Zampini } 2889fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2890fcdce8c4SStefano Zampini #endif 2891fcdce8c4SStefano Zampini } 289228b400f6SJacob Faibussowitsch PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 289328b400f6SJacob Faibussowitsch PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 2894fcdce8c4SStefano Zampini /* precompute flops count */ 2895fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2896fcdce8c4SStefano Zampini for (i = 0, flops = 0; i < A->rmap->n; i++) { 2897fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2898fcdce8c4SStefano Zampini const PetscInt en = a->i[i + 1]; 2899fcdce8c4SStefano Zampini for (j = st; j < en; j++) { 2900fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2901fcdce8c4SStefano Zampini flops += 2. * (b->i[brow + 1] - b->i[brow]); 2902fcdce8c4SStefano Zampini } 2903fcdce8c4SStefano Zampini } 2904fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2905fcdce8c4SStefano Zampini for (i = 0, flops = 0; i < A->rmap->n; i++) { 2906fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i + 1] - a->i[i]; 2907fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i + 1] - b->i[i]; 2908fcdce8c4SStefano Zampini flops += (2. * anzi) * bnzi; 2909fcdce8c4SStefano Zampini } 2910fcdce8c4SStefano Zampini } else { /* TODO */ 2911fcdce8c4SStefano Zampini flops = 0.; 2912fcdce8c4SStefano Zampini } 2913fcdce8c4SStefano Zampini 2914fcdce8c4SStefano Zampini mmdata->flops = flops; 29159566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2916b4285af6SJunchao Zhang 2917fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 29189566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 29199371c9d4SSatish Balay stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 29209371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29219566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 2922b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2923b4285af6SJunchao Zhang { 2924b4285af6SJunchao Zhang /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 2925b4285af6SJunchao Zhang We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 2926b4285af6SJunchao Zhang */ 2927b4285af6SJunchao Zhang void *dBuffer1 = NULL; 2928b4285af6SJunchao Zhang void *dBuffer2 = NULL; 2929b4285af6SJunchao Zhang void *dBuffer3 = NULL; 2930b4285af6SJunchao Zhang /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 2931b4285af6SJunchao Zhang size_t bufferSize1 = 0; 2932b4285af6SJunchao Zhang size_t bufferSize2 = 0; 2933b4285af6SJunchao Zhang size_t bufferSize3 = 0; 2934b4285af6SJunchao Zhang size_t bufferSize4 = 0; 2935b4285af6SJunchao Zhang size_t bufferSize5 = 0; 2936b4285af6SJunchao Zhang 2937b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2938b4285af6SJunchao Zhang /* ask bufferSize1 bytes for external memory */ 29399371c9d4SSatish Balay stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 29409371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29419566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 2942b4285af6SJunchao Zhang /* inspect the matrices A and B to understand the memory requirement for the next step */ 29439371c9d4SSatish Balay stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 29449371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2945b4285af6SJunchao Zhang 2946b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 29479371c9d4SSatish Balay stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 29489371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 29509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 29519566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 29529371c9d4SSatish Balay stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 29539371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29549566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer1)); 29559566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer2)); 2956b4285af6SJunchao Zhang 2957b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 2958b4285af6SJunchao Zhang /* get matrix C non-zero entries C_nnz1 */ 29599566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 2960b4285af6SJunchao Zhang c->nz = (PetscInt)C_nnz1; 2961b4285af6SJunchao Zhang /* allocate matrix C */ 29629371c9d4SSatish Balay Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 29639371c9d4SSatish Balay PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 29649371c9d4SSatish Balay Ccsr->values = new THRUSTARRAY(c->nz); 29659371c9d4SSatish Balay PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2966b4285af6SJunchao Zhang /* update matC with the new pointers */ 29679371c9d4SSatish Balay stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 29689371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2969b4285af6SJunchao Zhang 2970b4285af6SJunchao Zhang /*----------------------------------------------------------------------*/ 29719371c9d4SSatish Balay stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 29729371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 29749371c9d4SSatish Balay stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 29759371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29769566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(dBuffer3)); 29779371c9d4SSatish Balay stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 29789371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29799566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 2980b4285af6SJunchao Zhang } 2981ae37ee31SJunchao Zhang #else 2982b4285af6SJunchao Zhang size_t bufSize2; 2983fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 29849371c9d4SSatish Balay stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL); 29859371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29869566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 2987fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 29889371c9d4SSatish Balay stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 29899371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2990fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 29919371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 29929371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2993fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2994fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2995fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2996fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2997fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... */ 29989566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 2999fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 30009371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 30019371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3002fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 30039566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3004fcdce8c4SStefano Zampini c->nz = (PetscInt)C_nnz1; 30059371c9d4SSatish Balay PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 30069371c9d4SSatish Balay mmdata->mmBufferSize / 1024)); 3007fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 30089566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3009fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 30109566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 30119371c9d4SSatish Balay stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 30129371c9d4SSatish Balay PetscCallCUSPARSE(stat); 30139371c9d4SSatish Balay stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 30149371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3015ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3016fcdce8c4SStefano Zampini #else 30179566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 30189371c9d4SSatish Balay stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 30199371c9d4SSatish Balay Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 30209371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3021fcdce8c4SStefano Zampini c->nz = cnz; 3022fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 30239566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3024fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 30259566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3026fcdce8c4SStefano Zampini 30279566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3028fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3029fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3030fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 30319371c9d4SSatish Balay stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 30329371c9d4SSatish Balay Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 30339371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3034fcdce8c4SStefano Zampini #endif 30359566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 30369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3037fcdce8c4SStefano Zampini finalizesym: 3038fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 3039fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 3040fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 30419566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &c->i)); 30429566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->j)); 3043fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3044fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3045fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3046fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3047fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 3048fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 3049fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 30509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 30519566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3052fcdce8c4SStefano Zampini } else { 3053fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 3054fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 30559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 30569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3057fcdce8c4SStefano Zampini } 3058fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 3059fcdce8c4SStefano Zampini PetscInt r = 0; 3060fcdce8c4SStefano Zampini c->i[0] = 0; 3061fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 3062fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 3063fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 3064fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r + 1] = old; 3065fcdce8c4SStefano Zampini } 3066fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3067fcdce8c4SStefano Zampini } 30689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 30699566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 30709566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->imax)); 3071fcdce8c4SStefano Zampini c->maxnz = c->nz; 3072fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 3073fcdce8c4SStefano Zampini c->rmax = 0; 3074fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 3075fcdce8c4SStefano Zampini const PetscInt nn = c->i[k + 1] - c->i[k]; 3076fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 3077fcdce8c4SStefano Zampini c->nonzerorowcnt += (PetscInt) !!nn; 3078fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 3079fcdce8c4SStefano Zampini } 30809566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(C)); 30819566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->a)); 3082fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 3083fcdce8c4SStefano Zampini 3084fcdce8c4SStefano Zampini C->nonzerostate++; 30859566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->rmap)); 30869566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->cmap)); 3087fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 3088fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3089fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 3090fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 3091fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 3092abb89eb1SStefano Zampini if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3093fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 3094fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 3095fcdce8c4SStefano Zampini } 3096fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3097fcdce8c4SStefano Zampini PetscFunctionReturn(0); 3098fcdce8c4SStefano Zampini } 3099fcdce8c4SStefano Zampini 3100fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3101fcdce8c4SStefano Zampini 3102fcdce8c4SStefano Zampini /* handles sparse or dense B */ 31039371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) { 3104fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 3105fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3106fcdce8c4SStefano Zampini 3107fcdce8c4SStefano Zampini PetscFunctionBegin; 3108fcdce8c4SStefano Zampini MatCheckProduct(mat, 1); 31099566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3110*48a46eb9SPierre Jolivet if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3111fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 3112fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 3113*48a46eb9SPierre Jolivet if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3114fcdce8c4SStefano Zampini } 311565e4b4d4SStefano Zampini if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 311665e4b4d4SStefano Zampini PetscBool usecpu = PETSC_FALSE; 311765e4b4d4SStefano Zampini switch (product->type) { 311865e4b4d4SStefano Zampini case MATPRODUCT_AB: 311965e4b4d4SStefano Zampini if (product->api_user) { 3120d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 31219566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3122d0609cedSBarry Smith PetscOptionsEnd(); 312365e4b4d4SStefano Zampini } else { 3124d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 31259566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3126d0609cedSBarry Smith PetscOptionsEnd(); 312765e4b4d4SStefano Zampini } 312865e4b4d4SStefano Zampini break; 312965e4b4d4SStefano Zampini case MATPRODUCT_AtB: 313065e4b4d4SStefano Zampini if (product->api_user) { 3131d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 31329566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3133d0609cedSBarry Smith PetscOptionsEnd(); 313465e4b4d4SStefano Zampini } else { 3135d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 31369566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3137d0609cedSBarry Smith PetscOptionsEnd(); 313865e4b4d4SStefano Zampini } 313965e4b4d4SStefano Zampini break; 314065e4b4d4SStefano Zampini case MATPRODUCT_PtAP: 314165e4b4d4SStefano Zampini if (product->api_user) { 3142d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 31439566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3144d0609cedSBarry Smith PetscOptionsEnd(); 314565e4b4d4SStefano Zampini } else { 3146d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 31479566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3148d0609cedSBarry Smith PetscOptionsEnd(); 314965e4b4d4SStefano Zampini } 315065e4b4d4SStefano Zampini break; 315165e4b4d4SStefano Zampini case MATPRODUCT_RARt: 315265e4b4d4SStefano Zampini if (product->api_user) { 3153d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 31549566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3155d0609cedSBarry Smith PetscOptionsEnd(); 315665e4b4d4SStefano Zampini } else { 3157d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 31589566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3159d0609cedSBarry Smith PetscOptionsEnd(); 316065e4b4d4SStefano Zampini } 316165e4b4d4SStefano Zampini break; 316265e4b4d4SStefano Zampini case MATPRODUCT_ABC: 316365e4b4d4SStefano Zampini if (product->api_user) { 3164d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat"); 31659566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3166d0609cedSBarry Smith PetscOptionsEnd(); 316765e4b4d4SStefano Zampini } else { 3168d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 31699566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3170d0609cedSBarry Smith PetscOptionsEnd(); 317165e4b4d4SStefano Zampini } 317265e4b4d4SStefano Zampini break; 31739371c9d4SSatish Balay default: break; 317465e4b4d4SStefano Zampini } 317565e4b4d4SStefano Zampini if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 317665e4b4d4SStefano Zampini } 317765e4b4d4SStefano Zampini /* dispatch */ 3178fcdce8c4SStefano Zampini if (isdense) { 3179ccdfe979SStefano Zampini switch (product->type) { 3180ccdfe979SStefano Zampini case MATPRODUCT_AB: 3181ccdfe979SStefano Zampini case MATPRODUCT_AtB: 3182ccdfe979SStefano Zampini case MATPRODUCT_ABt: 3183ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 3184ccdfe979SStefano Zampini case MATPRODUCT_RARt: 3185fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 31869566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3187fcdce8c4SStefano Zampini } else { 3188fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3189fcdce8c4SStefano Zampini } 3190fcdce8c4SStefano Zampini break; 31919371c9d4SSatish Balay case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break; 31929371c9d4SSatish Balay default: break; 3193ccdfe979SStefano Zampini } 3194fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 3195fcdce8c4SStefano Zampini switch (product->type) { 3196fcdce8c4SStefano Zampini case MATPRODUCT_AB: 3197fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 31989371c9d4SSatish Balay case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break; 3199fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 3200fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 32019371c9d4SSatish Balay case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break; 32029371c9d4SSatish Balay default: break; 3203fcdce8c4SStefano Zampini } 3204fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 32059566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3206fcdce8c4SStefano Zampini } 3207ccdfe979SStefano Zampini PetscFunctionReturn(0); 3208ccdfe979SStefano Zampini } 3209ccdfe979SStefano Zampini 32109371c9d4SSatish Balay static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) { 32119ae82921SPaul Mullowney PetscFunctionBegin; 32129566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3213e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3214e6e9a74fSStefano Zampini } 3215e6e9a74fSStefano Zampini 32169371c9d4SSatish Balay static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) { 3217e6e9a74fSStefano Zampini PetscFunctionBegin; 32189566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3219e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3220e6e9a74fSStefano Zampini } 3221e6e9a74fSStefano Zampini 32229371c9d4SSatish Balay static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) { 3223e6e9a74fSStefano Zampini PetscFunctionBegin; 32249566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3225e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3226e6e9a74fSStefano Zampini } 3227e6e9a74fSStefano Zampini 32289371c9d4SSatish Balay static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) { 3229e6e9a74fSStefano Zampini PetscFunctionBegin; 32309566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 32319ae82921SPaul Mullowney PetscFunctionReturn(0); 32329ae82921SPaul Mullowney } 32339ae82921SPaul Mullowney 32349371c9d4SSatish Balay static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) { 3235ca45077fSPaul Mullowney PetscFunctionBegin; 32369566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3237ca45077fSPaul Mullowney PetscFunctionReturn(0); 3238ca45077fSPaul Mullowney } 3239ca45077fSPaul Mullowney 32409371c9d4SSatish Balay __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) { 3241a0e72f99SJunchao Zhang int i = blockIdx.x * blockDim.x + threadIdx.x; 3242a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 3243a0e72f99SJunchao Zhang } 3244a0e72f99SJunchao Zhang 3245afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 32469371c9d4SSatish Balay static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) { 32479ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3248aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 32499ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3250e6e9a74fSStefano Zampini PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3251e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3252e6e9a74fSStefano Zampini PetscBool compressed; 3253afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3254afb2bd1cSJunchao Zhang PetscInt nx, ny; 3255afb2bd1cSJunchao Zhang #endif 32566e111a19SKarl Rupp 32579ae82921SPaul Mullowney PetscFunctionBegin; 325808401ef6SPierre Jolivet PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3259cbc6b225SStefano Zampini if (!a->nz) { 32609566063dSJacob Faibussowitsch if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0)); 32619566063dSJacob Faibussowitsch else PetscCall(VecCopy_SeqCUDA(yy, zz)); 3262e6e9a74fSStefano Zampini PetscFunctionReturn(0); 3263e6e9a74fSStefano Zampini } 326434d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 32659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3266e6e9a74fSStefano Zampini if (!trans) { 32679ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 32685f80ce2aSJacob Faibussowitsch PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3269e6e9a74fSStefano Zampini } else { 32701a2c6b5cSJunchao Zhang if (herm || !A->form_explicit_transpose) { 3271e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3272e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3273e6e9a74fSStefano Zampini } else { 32749566063dSJacob Faibussowitsch if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3275e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3276e6e9a74fSStefano Zampini } 3277e6e9a74fSStefano Zampini } 3278e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3279e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 3280213423ffSJunchao Zhang 3281e6e9a74fSStefano Zampini try { 32829566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 32839566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */ 32849566063dSJacob Faibussowitsch else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3285afb2bd1cSJunchao Zhang 32869566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3287e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3288afb2bd1cSJunchao Zhang /* z = A x + beta y. 3289afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3290afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3291afb2bd1cSJunchao Zhang */ 3292e6e9a74fSStefano Zampini xptr = xarray; 3293afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3294213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3295afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3296afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3297afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 3298afb2bd1cSJunchao Zhang */ 3299afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3300afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3301afb2bd1cSJunchao Zhang nx = mat->num_cols; 3302afb2bd1cSJunchao Zhang ny = mat->num_rows; 3303afb2bd1cSJunchao Zhang } 3304afb2bd1cSJunchao Zhang #endif 3305e6e9a74fSStefano Zampini } else { 3306afb2bd1cSJunchao Zhang /* z = A^T x + beta y 3307afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3308afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3309afb2bd1cSJunchao Zhang */ 3310afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3311e6e9a74fSStefano Zampini dptr = zarray; 3312e6e9a74fSStefano Zampini beta = yy ? matstruct->beta_one : matstruct->beta_zero; 3313afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 3314e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3315a0e72f99SJunchao Zhang thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 33169371c9d4SSatish Balay thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3317e6e9a74fSStefano Zampini } 3318afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3319afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3320afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3321afb2bd1cSJunchao Zhang nx = mat->num_rows; 3322afb2bd1cSJunchao Zhang ny = mat->num_cols; 3323afb2bd1cSJunchao Zhang } 3324afb2bd1cSJunchao Zhang #endif 3325e6e9a74fSStefano Zampini } 33269ae82921SPaul Mullowney 3327afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 3328aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3329afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 33305f80ce2aSJacob Faibussowitsch PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3331afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 33329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 33339566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 33349371c9d4SSatish Balay PetscCallCUSPARSE( 33359371c9d4SSatish Balay cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 33369566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3337afb2bd1cSJunchao Zhang 3338afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3339afb2bd1cSJunchao Zhang } else { 3340afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 33419566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 33429566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3343afb2bd1cSJunchao Zhang } 3344afb2bd1cSJunchao Zhang 33459371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */ 33469371c9d4SSatish Balay matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3347afb2bd1cSJunchao Zhang #else 33487656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 33499371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3350afb2bd1cSJunchao Zhang #endif 3351aa372e3fSPaul Mullowney } else { 3352213423ffSJunchao Zhang if (cusparsestruct->nrows) { 3353afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3354afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3355afb2bd1cSJunchao Zhang #else 3356301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 33579371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3358afb2bd1cSJunchao Zhang #endif 3359a65300a6SPaul Mullowney } 3360aa372e3fSPaul Mullowney } 33619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3362aa372e3fSPaul Mullowney 3363e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3364213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3365213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 33669566063dSJacob Faibussowitsch PetscCall(VecCopy_SeqCUDA(yy, zz)); /* zz = yy */ 3367e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 33689566063dSJacob Faibussowitsch PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ 33697656d835SStefano Zampini } 3370213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 33719566063dSJacob Faibussowitsch PetscCall(VecSet_SeqCUDA(zz, 0)); 33727656d835SStefano Zampini } 33737656d835SStefano Zampini 3374213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 3375213423ffSJunchao Zhang if (compressed) { 33769566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3377a0e72f99SJunchao Zhang /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred) 3378a0e72f99SJunchao Zhang and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to 3379a0e72f99SJunchao Zhang prevent that. So I just add a ScatterAdd kernel. 3380a0e72f99SJunchao Zhang */ 3381a0e72f99SJunchao Zhang #if 0 3382a0e72f99SJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 3383a0e72f99SJunchao Zhang thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream), 3384a0e72f99SJunchao Zhang thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 3385e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 3386c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 3387a0e72f99SJunchao Zhang #else 3388a0e72f99SJunchao Zhang PetscInt n = matstruct->cprowIndices->size(); 3389a0e72f99SJunchao Zhang ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray); 3390a0e72f99SJunchao Zhang #endif 33919566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3392e6e9a74fSStefano Zampini } 3393e6e9a74fSStefano Zampini } else { 33949371c9d4SSatish Balay if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ } 3395e6e9a74fSStefano Zampini } 33969566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray)); 33979566063dSJacob Faibussowitsch if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray)); 33989566063dSJacob Faibussowitsch else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray)); 33999371c9d4SSatish Balay } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); } 3400e6e9a74fSStefano Zampini if (yy) { 34019566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * a->nz)); 3402e6e9a74fSStefano Zampini } else { 34039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt)); 3404e6e9a74fSStefano Zampini } 34059ae82921SPaul Mullowney PetscFunctionReturn(0); 34069ae82921SPaul Mullowney } 34079ae82921SPaul Mullowney 34089371c9d4SSatish Balay static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) { 3409ca45077fSPaul Mullowney PetscFunctionBegin; 34109566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE)); 3411ca45077fSPaul Mullowney PetscFunctionReturn(0); 3412ca45077fSPaul Mullowney } 3413ca45077fSPaul Mullowney 34149371c9d4SSatish Balay static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) { 3415042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 3416042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 34173fa6b06aSMark Adams 3418042217e8SBarry Smith PetscFunctionBegin; 34199566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 3420042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 34219566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n")); 34229566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->deviceMat)); 3423042217e8SBarry Smith cusp->deviceMat = NULL; 3424042217e8SBarry Smith } 34259ae82921SPaul Mullowney PetscFunctionReturn(0); 34269ae82921SPaul Mullowney } 34279ae82921SPaul Mullowney 34289ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 3429e057df02SPaul Mullowney /*@ 34309ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 3431e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 3432e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 3433e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 3434e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 3435e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 34369ae82921SPaul Mullowney 3437d083f849SBarry Smith Collective 34389ae82921SPaul Mullowney 34399ae82921SPaul Mullowney Input Parameters: 34409ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 34419ae82921SPaul Mullowney . m - number of rows 34429ae82921SPaul Mullowney . n - number of columns 34439ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 34449ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 34450298fd71SBarry Smith (possibly different for each row) or NULL 34469ae82921SPaul Mullowney 34479ae82921SPaul Mullowney Output Parameter: 34489ae82921SPaul Mullowney . A - the matrix 34499ae82921SPaul Mullowney 34509ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 34519ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 34529ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 34539ae82921SPaul Mullowney 34549ae82921SPaul Mullowney Notes: 34559ae82921SPaul Mullowney If nnz is given then nz is ignored 34569ae82921SPaul Mullowney 34579ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 34589ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 34599ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 34609ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 34619ae82921SPaul Mullowney 34629ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 34630298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 34649ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 34659ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 34669ae82921SPaul Mullowney 34679ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 34689ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 34699ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 34709ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 34719ae82921SPaul Mullowney 34729ae82921SPaul Mullowney Level: intermediate 34739ae82921SPaul Mullowney 3474db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE` 34759ae82921SPaul Mullowney @*/ 34769371c9d4SSatish Balay PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) { 34779ae82921SPaul Mullowney PetscFunctionBegin; 34789566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 34799566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 34809566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 34819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 34829ae82921SPaul Mullowney PetscFunctionReturn(0); 34839ae82921SPaul Mullowney } 34849ae82921SPaul Mullowney 34859371c9d4SSatish Balay static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) { 34869ae82921SPaul Mullowney PetscFunctionBegin; 34879ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 34889566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr)); 34899ae82921SPaul Mullowney } else { 34909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr)); 3491aa372e3fSPaul Mullowney } 34929566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 34939566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 34949566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL)); 34959566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 34969566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 34979566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 34989566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 34999566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 35009566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 35019566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL)); 35029566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 35039ae82921SPaul Mullowney PetscFunctionReturn(0); 35049ae82921SPaul Mullowney } 35059ae82921SPaul Mullowney 3506ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *); 350795639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool); 35089371c9d4SSatish Balay static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) { 35099ff858a8SKarl Rupp PetscFunctionBegin; 35109566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B)); 35119566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B)); 35129ff858a8SKarl Rupp PetscFunctionReturn(0); 35139ff858a8SKarl Rupp } 35149ff858a8SKarl Rupp 35159371c9d4SSatish Balay static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) { 3516a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3517039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3518039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3519039c6fbaSStefano Zampini PetscScalar *ay; 3520039c6fbaSStefano Zampini const PetscScalar *ax; 3521039c6fbaSStefano Zampini CsrMatrix *csry, *csrx; 3522e6e9a74fSStefano Zampini 352395639643SRichard Tran Mills PetscFunctionBegin; 3524a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3525a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3526039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 35279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 35289566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3529a587d139SMark PetscFunctionReturn(0); 353095639643SRichard Tran Mills } 3531039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 35329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 35339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 35345f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 35355f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3536039c6fbaSStefano Zampini csry = (CsrMatrix *)cy->mat->mat; 3537039c6fbaSStefano Zampini csrx = (CsrMatrix *)cx->mat->mat; 3538039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3539039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3540039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 35419371c9d4SSatish Balay if (eq) { eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); } 3542039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3543039c6fbaSStefano Zampini } 3544d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3545d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3546039c6fbaSStefano Zampini 3547039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3548039c6fbaSStefano Zampini PetscScalar b = 1.0; 3549039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3550039c6fbaSStefano Zampini size_t bufferSize; 3551039c6fbaSStefano Zampini void *buffer; 3552039c6fbaSStefano Zampini #endif 3553039c6fbaSStefano Zampini 35549566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 35559566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 35569566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3557039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 35589371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 35599371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 35609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 35619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 35629371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 35639371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 35649566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 35659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 35669566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 3567039c6fbaSStefano Zampini #else 35689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 35699371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 35709371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 35719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 35729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3573039c6fbaSStefano Zampini #endif 35749566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 35759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 35769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 35779566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3578039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3579a587d139SMark cublasHandle_t cublasv2handle; 3580a587d139SMark PetscBLASInt one = 1, bnz = 1; 3581039c6fbaSStefano Zampini 35829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 35839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 35849566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 35859566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz, &bnz)); 35869566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 35879566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 35889566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * bnz)); 35899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 35909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 35919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 35929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3593039c6fbaSStefano Zampini } else { 35949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 35959566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3596a587d139SMark } 359795639643SRichard Tran Mills PetscFunctionReturn(0); 359895639643SRichard Tran Mills } 359995639643SRichard Tran Mills 36009371c9d4SSatish Balay static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) { 360133c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 360233c9ba73SStefano Zampini PetscScalar *ay; 360333c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 360433c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 360533c9ba73SStefano Zampini 360633c9ba73SStefano Zampini PetscFunctionBegin; 36079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 36089566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 36099566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz, &bnz)); 36109566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 36119566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 36129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(bnz)); 36139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 36149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 36159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 361633c9ba73SStefano Zampini PetscFunctionReturn(0); 361733c9ba73SStefano Zampini } 361833c9ba73SStefano Zampini 36199371c9d4SSatish Balay static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) { 36207e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3621a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 36227e8381f9SStefano Zampini 36233fa6b06aSMark Adams PetscFunctionBegin; 36243fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 36253fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 36267e8381f9SStefano Zampini if (spptr->mat) { 36277e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 36287e8381f9SStefano Zampini if (matrix->values) { 36297e8381f9SStefano Zampini both = PETSC_TRUE; 36307e8381f9SStefano Zampini thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 36317e8381f9SStefano Zampini } 36327e8381f9SStefano Zampini } 36337e8381f9SStefano Zampini if (spptr->matTranspose) { 36347e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 36359371c9d4SSatish Balay if (matrix->values) { thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); } 36367e8381f9SStefano Zampini } 36373fa6b06aSMark Adams } 36389566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 36399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 36407e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3641a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 36423fa6b06aSMark Adams PetscFunctionReturn(0); 36433fa6b06aSMark Adams } 36443fa6b06aSMark Adams 36459371c9d4SSatish Balay static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) { 3646a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3647a587d139SMark 3648a587d139SMark PetscFunctionBegin; 36499a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 36509a14fc28SStefano Zampini A->boundtocpu = flg; 36519a14fc28SStefano Zampini PetscFunctionReturn(0); 36529a14fc28SStefano Zampini } 3653a587d139SMark if (flg) { 36549566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3655a587d139SMark 365633c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3657a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3658a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3659a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3660a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3661a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3662a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3663a587d139SMark A->ops->multhermitiantranspose = NULL; 3664a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3665fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 36669566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 36679566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 36689566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 36699566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 36709566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 36719566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 36729566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3673a587d139SMark } else { 367433c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3675a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3676a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3677a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3678a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3679a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3680a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3681a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3682a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3683fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 368467a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 368567a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 368667a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 368767a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 368867a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 368967a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 36907ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 36917ee59b9bSJunchao Zhang 36929566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 36939566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 36949566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 36959566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 36969566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 36979566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 3698a587d139SMark } 3699a587d139SMark A->boundtocpu = flg; 3700ea500dcfSRichard Tran Mills if (flg && a->inode.size) { 3701ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 3702ea500dcfSRichard Tran Mills } else { 3703ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 3704ea500dcfSRichard Tran Mills } 3705a587d139SMark PetscFunctionReturn(0); 3706a587d139SMark } 3707a587d139SMark 37089371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) { 370949735bf3SStefano Zampini Mat B; 37109ae82921SPaul Mullowney 37119ae82921SPaul Mullowney PetscFunctionBegin; 37129566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 371349735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 37149566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 371549735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 37169566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 371749735bf3SStefano Zampini } 371849735bf3SStefano Zampini B = *newmat; 371949735bf3SStefano Zampini 37209566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 37219566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 372234136279SStefano Zampini 372349735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 37249ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 3725e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 37269566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 37279566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 37289566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 37291a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 3730d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3731ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301 3732a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 3733a435da06SStefano Zampini #else 3734d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 3735a435da06SStefano Zampini #endif 3736d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 3737d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 3738d8132acaSStefano Zampini #endif 37391a2c6b5cSJunchao Zhang B->spptr = spptr; 37409ae82921SPaul Mullowney } else { 3741e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3742e6e9a74fSStefano Zampini 37439566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 37449566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 37459566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 3746e6e9a74fSStefano Zampini B->spptr = spptr; 37479ae82921SPaul Mullowney } 3748e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 374949735bf3SStefano Zampini } 3750693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 37519ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 37521a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 37539ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 375495639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3755693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 37562205254eSKarl Rupp 37579566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 37589566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 37599566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 3760ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 37619566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 3762ae48a8d0SStefano Zampini #endif 37639566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 37649ae82921SPaul Mullowney PetscFunctionReturn(0); 37659ae82921SPaul Mullowney } 37669ae82921SPaul Mullowney 37679371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) { 376802fe1965SBarry Smith PetscFunctionBegin; 37699566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(B)); 37709566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B)); 377102fe1965SBarry Smith PetscFunctionReturn(0); 377202fe1965SBarry Smith } 377302fe1965SBarry Smith 37743ca39a21SBarry Smith /*MC 3775e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3776e057df02SPaul Mullowney 3777e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 37782692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 37792692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3780e057df02SPaul Mullowney 3781e057df02SPaul Mullowney Options Database Keys: 3782e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3783aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3784a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3785365b711fSMark Adams + -mat_cusparse_use_cpu_solve - Do MatSolve on CPU 3786e057df02SPaul Mullowney 3787e057df02SPaul Mullowney Level: beginner 3788e057df02SPaul Mullowney 3789db781477SPatrick Sanan .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 3790e057df02SPaul Mullowney M*/ 37917f756511SDominic Meiser 3792bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *); 37930f39cd5aSBarry Smith 37949371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) { 379542c9c57cSBarry Smith PetscFunctionBegin; 37969566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band)); 37979566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse)); 37989566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse)); 37999566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse)); 38009566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse)); 3801bddcd29dSMark Adams 380242c9c57cSBarry Smith PetscFunctionReturn(0); 380342c9c57cSBarry Smith } 380429b38603SBarry Smith 38059371c9d4SSatish Balay static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) { 3806cbc6b225SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr; 3807cbc6b225SStefano Zampini 3808cbc6b225SStefano Zampini PetscFunctionBegin; 3809cbc6b225SStefano Zampini if (!cusp) PetscFunctionReturn(0); 3810cbc6b225SStefano Zampini delete cusp->cooPerm; 3811cbc6b225SStefano Zampini delete cusp->cooPerm_a; 3812cbc6b225SStefano Zampini cusp->cooPerm = NULL; 3813cbc6b225SStefano Zampini cusp->cooPerm_a = NULL; 3814cbc6b225SStefano Zampini if (cusp->use_extended_coo) { 38159566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->jmap_d)); 38169566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusp->perm_d)); 3817cbc6b225SStefano Zampini } 3818cbc6b225SStefano Zampini cusp->use_extended_coo = PETSC_FALSE; 3819cbc6b225SStefano Zampini PetscFunctionReturn(0); 3820cbc6b225SStefano Zampini } 3821cbc6b225SStefano Zampini 38229371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) { 38237f756511SDominic Meiser PetscFunctionBegin; 38247f756511SDominic Meiser if (*cusparsestruct) { 38259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format)); 38269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format)); 38277f756511SDominic Meiser delete (*cusparsestruct)->workVector; 382881902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 38297e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 38307e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 3831a49f1ed0SStefano Zampini delete (*cusparsestruct)->csr2csc_i; 38329566063dSJacob Faibussowitsch if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle)); 38339566063dSJacob Faibussowitsch if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d)); 38349566063dSJacob Faibussowitsch if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d)); 38359566063dSJacob Faibussowitsch PetscCall(PetscFree(*cusparsestruct)); 38367f756511SDominic Meiser } 38377f756511SDominic Meiser PetscFunctionReturn(0); 38387f756511SDominic Meiser } 38397f756511SDominic Meiser 38409371c9d4SSatish Balay static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) { 38417f756511SDominic Meiser PetscFunctionBegin; 38427f756511SDominic Meiser if (*mat) { 38437f756511SDominic Meiser delete (*mat)->values; 38447f756511SDominic Meiser delete (*mat)->column_indices; 38457f756511SDominic Meiser delete (*mat)->row_offsets; 38467f756511SDominic Meiser delete *mat; 38477f756511SDominic Meiser *mat = 0; 38487f756511SDominic Meiser } 38497f756511SDominic Meiser PetscFunctionReturn(0); 38507f756511SDominic Meiser } 38517f756511SDominic Meiser 38529371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) { 38537f756511SDominic Meiser PetscFunctionBegin; 38547f756511SDominic Meiser if (*trifactor) { 38559566063dSJacob Faibussowitsch if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr)); 3856261a78b4SJunchao Zhang if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo)); 38579566063dSJacob Faibussowitsch PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat)); 38589566063dSJacob Faibussowitsch if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer)); 38599566063dSJacob Faibussowitsch if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); 3860afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 38619566063dSJacob Faibussowitsch if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer)); 3862afb2bd1cSJunchao Zhang #endif 38639566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactor)); 38647f756511SDominic Meiser } 38657f756511SDominic Meiser PetscFunctionReturn(0); 38667f756511SDominic Meiser } 38677f756511SDominic Meiser 38689371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) { 38697f756511SDominic Meiser CsrMatrix *mat; 38707f756511SDominic Meiser 38717f756511SDominic Meiser PetscFunctionBegin; 38727f756511SDominic Meiser if (*matstruct) { 38737f756511SDominic Meiser if ((*matstruct)->mat) { 38747f756511SDominic Meiser if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) { 3875afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3876afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3877afb2bd1cSJunchao Zhang #else 38787f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 38799566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 3880afb2bd1cSJunchao Zhang #endif 38817f756511SDominic Meiser } else { 38827f756511SDominic Meiser mat = (CsrMatrix *)(*matstruct)->mat; 38837f756511SDominic Meiser CsrMatrix_Destroy(&mat); 38847f756511SDominic Meiser } 38857f756511SDominic Meiser } 38869566063dSJacob Faibussowitsch if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 38877f756511SDominic Meiser delete (*matstruct)->cprowIndices; 38889566063dSJacob Faibussowitsch if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 38899566063dSJacob Faibussowitsch if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 38909566063dSJacob Faibussowitsch if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 3891afb2bd1cSJunchao Zhang 3892afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3893afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 38949566063dSJacob Faibussowitsch if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 3895afb2bd1cSJunchao Zhang for (int i = 0; i < 3; i++) { 3896afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 38979566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 38989566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 38999566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 3900afb2bd1cSJunchao Zhang } 3901afb2bd1cSJunchao Zhang } 3902afb2bd1cSJunchao Zhang #endif 39037f756511SDominic Meiser delete *matstruct; 39047e8381f9SStefano Zampini *matstruct = NULL; 39057f756511SDominic Meiser } 39067f756511SDominic Meiser PetscFunctionReturn(0); 39077f756511SDominic Meiser } 39087f756511SDominic Meiser 39099371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) { 3910da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 3911da112707SJunchao Zhang 39127f756511SDominic Meiser PetscFunctionBegin; 3913da112707SJunchao Zhang if (fs) { 3914da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 3915da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 3916da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 3917da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 3918da112707SJunchao Zhang delete fs->rpermIndices; 3919da112707SJunchao Zhang delete fs->cpermIndices; 3920da112707SJunchao Zhang delete fs->workVector; 3921da112707SJunchao Zhang fs->rpermIndices = NULL; 3922da112707SJunchao Zhang fs->cpermIndices = NULL; 3923da112707SJunchao Zhang fs->workVector = NULL; 3924da112707SJunchao Zhang if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d)); 3925da112707SJunchao Zhang if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d)); 3926da112707SJunchao Zhang fs->init_dev_prop = PETSC_FALSE; 3927da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500 3928da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrRowPtr)); 3929da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrColIdx)); 3930da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->csrVal)); 3931da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->X)); 3932da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->Y)); 393312ba2bc6SJunchao Zhang // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 3934da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 3935da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 393612ba2bc6SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 3937da112707SJunchao Zhang PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 3938da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 3939da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 3940da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 3941da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 3942da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 3943da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 3944da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 3945da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 3946da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 3947da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 3948da112707SJunchao Zhang PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 394912ba2bc6SJunchao Zhang 395012ba2bc6SJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_FALSE; 395112ba2bc6SJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 3952da112707SJunchao Zhang #endif 3953ccdfe979SStefano Zampini } 3954ccdfe979SStefano Zampini PetscFunctionReturn(0); 3955ccdfe979SStefano Zampini } 3956ccdfe979SStefano Zampini 39579371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) { 3958ccdfe979SStefano Zampini cusparseHandle_t handle; 3959ccdfe979SStefano Zampini 3960ccdfe979SStefano Zampini PetscFunctionBegin; 3961ccdfe979SStefano Zampini if (*trifactors) { 39629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors)); 3963*48a46eb9SPierre Jolivet if (handle = (*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy(handle)); 39649566063dSJacob Faibussowitsch PetscCall(PetscFree(*trifactors)); 39657f756511SDominic Meiser } 39667f756511SDominic Meiser PetscFunctionReturn(0); 39677f756511SDominic Meiser } 39687e8381f9SStefano Zampini 39699371c9d4SSatish Balay struct IJCompare { 39709371c9d4SSatish Balay __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) { 39717e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 39727e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 39737e8381f9SStefano Zampini return false; 39747e8381f9SStefano Zampini } 39757e8381f9SStefano Zampini }; 39767e8381f9SStefano Zampini 39779371c9d4SSatish Balay struct IJEqual { 39789371c9d4SSatish Balay __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) { 39797e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 39807e8381f9SStefano Zampini return true; 39817e8381f9SStefano Zampini } 39827e8381f9SStefano Zampini }; 39837e8381f9SStefano Zampini 39849371c9d4SSatish Balay struct IJDiff { 39859371c9d4SSatish Balay __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; } 39867e8381f9SStefano Zampini }; 39877e8381f9SStefano Zampini 39889371c9d4SSatish Balay struct IJSum { 39899371c9d4SSatish Balay __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; } 39907e8381f9SStefano Zampini }; 39917e8381f9SStefano Zampini 39927e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3993219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */ 39949371c9d4SSatish Balay PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) { 39957e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 3996fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3997bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_v = NULL; 399808391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 39997e8381f9SStefano Zampini CsrMatrix *matrix; 40007e8381f9SStefano Zampini PetscInt n; 40017e8381f9SStefano Zampini 40027e8381f9SStefano Zampini PetscFunctionBegin; 400328b400f6SJacob Faibussowitsch PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct"); 400428b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix"); 40057e8381f9SStefano Zampini if (!cusp->cooPerm) { 40069566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 40079566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 40087e8381f9SStefano Zampini PetscFunctionReturn(0); 40097e8381f9SStefano Zampini } 40107e8381f9SStefano Zampini matrix = (CsrMatrix *)cusp->mat->mat; 401128b400f6SJacob Faibussowitsch PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4012e61fc153SStefano Zampini if (!v) { 4013e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 4014e61fc153SStefano Zampini goto finalize; 40157e8381f9SStefano Zampini } 4016e61fc153SStefano Zampini n = cusp->cooPerm->size(); 401708391a17SStefano Zampini if (isCudaMem(v)) { 401808391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 401908391a17SStefano Zampini } else { 4020e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 4021e61fc153SStefano Zampini cooPerm_v->assign(v, v + n); 402208391a17SStefano Zampini d_v = cooPerm_v->data(); 40239566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 402408391a17SStefano Zampini } 40259566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4026e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 4027ddea5d60SJunchao Zhang if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */ 4028bfcc3627SStefano Zampini THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size()); 402908391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()); 4030ddea5d60SJunchao Zhang /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) 4031ddea5d60SJunchao Zhang cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[]. 4032ddea5d60SJunchao Zhang cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero. 4033ddea5d60SJunchao Zhang */ 4034e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>()); 4035e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>()); 4036e61fc153SStefano Zampini delete cooPerm_w; 40377e8381f9SStefano Zampini } else { 4038ddea5d60SJunchao Zhang /* all nonzeros in d_v[] are unique entries */ 40399371c9d4SSatish Balay auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin())); 40409371c9d4SSatish Balay auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end())); 4041ddea5d60SJunchao Zhang thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */ 40427e8381f9SStefano Zampini } 40437e8381f9SStefano Zampini } else { 4044e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 404508391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()); 4046e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>()); 40477e8381f9SStefano Zampini } else { 40489371c9d4SSatish Balay auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin())); 40499371c9d4SSatish Balay auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end())); 40507e8381f9SStefano Zampini thrust::for_each(zibit, zieit, VecCUDAEquals()); 40517e8381f9SStefano Zampini } 40527e8381f9SStefano Zampini } 40539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4054e61fc153SStefano Zampini finalize: 4055e61fc153SStefano Zampini delete cooPerm_v; 40567e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 40579566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4058fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 40599566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz)); 40609566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n")); 40619566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax)); 4062fcdce8c4SStefano Zampini a->reallocs = 0; 4063fcdce8c4SStefano Zampini A->info.mallocs += 0; 4064fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 4065fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 4066fcdce8c4SStefano Zampini A->num_ass++; 40677e8381f9SStefano Zampini PetscFunctionReturn(0); 40687e8381f9SStefano Zampini } 40697e8381f9SStefano Zampini 40709371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) { 4071a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4072a49f1ed0SStefano Zampini 4073a49f1ed0SStefano Zampini PetscFunctionBegin; 4074a49f1ed0SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4075a49f1ed0SStefano Zampini if (!cusp) PetscFunctionReturn(0); 4076a49f1ed0SStefano Zampini if (destroy) { 40779566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format)); 4078a49f1ed0SStefano Zampini delete cusp->csr2csc_i; 4079a49f1ed0SStefano Zampini cusp->csr2csc_i = NULL; 4080a49f1ed0SStefano Zampini } 40811a2c6b5cSJunchao Zhang A->transupdated = PETSC_FALSE; 4082a49f1ed0SStefano Zampini PetscFunctionReturn(0); 4083a49f1ed0SStefano Zampini } 4084a49f1ed0SStefano Zampini 40857e8381f9SStefano Zampini #include <thrust/binary_search.h> 4086219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */ 40879371c9d4SSatish Balay PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) { 40887e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 40897e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 40907e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 40917e8381f9SStefano Zampini 40927e8381f9SStefano Zampini PetscFunctionBegin; 40939566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->rmap)); 40949566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(A->cmap)); 40957e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0; 40967e8381f9SStefano Zampini if (n != cooPerm_n) { 40977e8381f9SStefano Zampini delete cusp->cooPerm; 40987e8381f9SStefano Zampini delete cusp->cooPerm_a; 40997e8381f9SStefano Zampini cusp->cooPerm = NULL; 41007e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 41017e8381f9SStefano Zampini } 41027e8381f9SStefano Zampini if (n) { 4103e8729f6fSJunchao Zhang thrust::device_ptr<PetscInt> d_i, d_j; 4104e8729f6fSJunchao Zhang PetscInt *d_raw_i, *d_raw_j; 4105e8729f6fSJunchao Zhang PetscBool free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE; 4106e8729f6fSJunchao Zhang PetscMemType imtype, jmtype; 4107e8729f6fSJunchao Zhang 4108e8729f6fSJunchao Zhang PetscCall(PetscGetMemType(coo_i, &imtype)); 4109e8729f6fSJunchao Zhang if (PetscMemTypeHost(imtype)) { 4110e8729f6fSJunchao Zhang PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n)); 4111e8729f6fSJunchao Zhang PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 4112e8729f6fSJunchao Zhang d_i = thrust::device_pointer_cast(d_raw_i); 4113e8729f6fSJunchao Zhang free_raw_i = PETSC_TRUE; 4114e8729f6fSJunchao Zhang PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt))); 4115e8729f6fSJunchao Zhang } else { 4116e8729f6fSJunchao Zhang d_i = thrust::device_pointer_cast(coo_i); 4117e8729f6fSJunchao Zhang } 4118e8729f6fSJunchao Zhang 4119e8729f6fSJunchao Zhang PetscCall(PetscGetMemType(coo_j, &jmtype)); 4120e8729f6fSJunchao Zhang if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]! 4121e8729f6fSJunchao Zhang PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n)); 4122e8729f6fSJunchao Zhang PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice)); 4123e8729f6fSJunchao Zhang d_j = thrust::device_pointer_cast(d_raw_j); 4124e8729f6fSJunchao Zhang free_raw_j = PETSC_TRUE; 4125e8729f6fSJunchao Zhang PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt))); 4126e8729f6fSJunchao Zhang } else { 4127e8729f6fSJunchao Zhang d_j = thrust::device_pointer_cast(coo_j); 4128e8729f6fSJunchao Zhang } 4129e8729f6fSJunchao Zhang 41307e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 41317e8381f9SStefano Zampini 41327e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 41337e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 41347e8381f9SStefano Zampini 4135ddea5d60SJunchao Zhang /* Ex. 4136ddea5d60SJunchao Zhang n = 6 4137ddea5d60SJunchao Zhang coo_i = [3,3,1,4,1,4] 4138ddea5d60SJunchao Zhang coo_j = [3,2,2,5,2,6] 4139ddea5d60SJunchao Zhang */ 4140e8729f6fSJunchao Zhang auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j)); 4141e8729f6fSJunchao Zhang auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n)); 41427e8381f9SStefano Zampini 41439566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 41447e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 4145ddea5d60SJunchao Zhang thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */ 4146e8729f6fSJunchao Zhang (*cusp->cooPerm_a).assign(d_i, d_i + n); /* copy the sorted array */ 4147e8729f6fSJunchao Zhang THRUSTINTARRAY w(d_j, d_j + n); 41487e8381f9SStefano Zampini 4149ddea5d60SJunchao Zhang /* 4150ddea5d60SJunchao Zhang d_i = [1,1,3,3,4,4] 4151ddea5d60SJunchao Zhang d_j = [2,2,2,3,5,6] 4152ddea5d60SJunchao Zhang cooPerm = [2,4,1,0,3,5] 4153ddea5d60SJunchao Zhang */ 4154ddea5d60SJunchao Zhang auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */ 4155ddea5d60SJunchao Zhang 4156ddea5d60SJunchao Zhang /* 4157ddea5d60SJunchao Zhang d_i = [1,3,3,4,4,x] 4158ddea5d60SJunchao Zhang ^ekey 4159ddea5d60SJunchao Zhang d_j = [2,2,3,5,6,x] 4160ddea5d60SJunchao Zhang ^nekye 4161ddea5d60SJunchao Zhang */ 41627e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 41637e8381f9SStefano Zampini delete cusp->cooPerm_a; 41647e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 4165ddea5d60SJunchao Zhang } else { /* Stefano: I couldn't come up with a more elegant algorithm */ 4166ddea5d60SJunchao Zhang /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */ 4167ddea5d60SJunchao Zhang adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/ 4168ddea5d60SJunchao Zhang adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff()); /* w: [2,2,2,3,5,6] => [2,0,0,1,1,1]*/ 4169ddea5d60SJunchao Zhang (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */ 41707e8381f9SStefano Zampini w[0] = 0; 4171ddea5d60SJunchao Zhang thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum()); /* cooPerm_a = [0,0,1,1,1,1]*/ 4172ddea5d60SJunchao Zhang thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/ 41737e8381f9SStefano Zampini } 41747e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 4175e8729f6fSJunchao Zhang thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */ 4176ddea5d60SJunchao Zhang search_begin, search_begin + A->rmap->n, /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */ 4177ddea5d60SJunchao Zhang ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */ 41789566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 41797e8381f9SStefano Zampini 41809566063dSJacob Faibussowitsch PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i)); 41817e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 41827e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 41837e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 41849566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i)); 4185ddea5d60SJunchao Zhang a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */ 41869566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 41877e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 4188fcdce8c4SStefano Zampini a->rmax = 0; 41899566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz, &a->a)); 41909566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(a->nz, &a->j)); 4191e8729f6fSJunchao Zhang PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 41929566063dSJacob Faibussowitsch if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen)); 41939566063dSJacob Faibussowitsch if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax)); 41947e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 41957e8381f9SStefano Zampini const PetscInt nnzr = a->i[i + 1] - a->i[i]; 41967e8381f9SStefano Zampini nzr += (PetscInt) !!(nnzr); 41977e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 4198fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax, nnzr); 41997e8381f9SStefano Zampini } 4200fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 42017e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 42029566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt))); 42039566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(A)); 4204e8729f6fSJunchao Zhang if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i)); 4205e8729f6fSJunchao Zhang if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j)); 42067e8381f9SStefano Zampini } else { 42079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL)); 42087e8381f9SStefano Zampini } 42099566063dSJacob Faibussowitsch PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE)); 42107e8381f9SStefano Zampini 42117e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 4212e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 42139566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->nz)); 42149566063dSJacob Faibussowitsch PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6)); 42157e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 42169566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 42179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 42187e8381f9SStefano Zampini PetscFunctionReturn(0); 42197e8381f9SStefano Zampini } 4220ed502f03SStefano Zampini 42219371c9d4SSatish Balay PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) { 4222219fbbafSJunchao Zhang Mat_SeqAIJ *seq; 4223219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev; 4224cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 4225219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 4226219fbbafSJunchao Zhang 4227219fbbafSJunchao Zhang PetscFunctionBegin; 42289566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJ(mat)); 42299566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat)); 4230219fbbafSJunchao Zhang if (coo_i) { 42319566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i, &mtype)); 4232219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 4233219fbbafSJunchao Zhang for (PetscCount k = 0; k < coo_n; k++) { 42349371c9d4SSatish Balay if (coo_i[k] < 0 || coo_j[k] < 0) { 42359371c9d4SSatish Balay coo_basic = PETSC_FALSE; 42369371c9d4SSatish Balay break; 42379371c9d4SSatish Balay } 4238219fbbafSJunchao Zhang } 4239219fbbafSJunchao Zhang } 4240219fbbafSJunchao Zhang } 4241219fbbafSJunchao Zhang 4242219fbbafSJunchao Zhang if (coo_basic) { /* i,j are on device or do not contain negative indices */ 42439566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j)); 4244219fbbafSJunchao Zhang } else { 42459566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 4246cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 42479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat)); 4248219fbbafSJunchao Zhang seq = static_cast<Mat_SeqAIJ *>(mat->data); 4249219fbbafSJunchao Zhang dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr); 42509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount))); 42519566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice)); 42529566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount))); 42539566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice)); 4254219fbbafSJunchao Zhang dev->use_extended_coo = PETSC_TRUE; 4255219fbbafSJunchao Zhang } 4256219fbbafSJunchao Zhang PetscFunctionReturn(0); 4257219fbbafSJunchao Zhang } 4258219fbbafSJunchao Zhang 42599371c9d4SSatish Balay __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) { 4260219fbbafSJunchao Zhang PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; 4261219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 4262b6c38306SJunchao Zhang for (; i < nnz; i += grid_size) { 4263b6c38306SJunchao Zhang PetscScalar sum = 0.0; 4264b6c38306SJunchao Zhang for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]]; 4265b6c38306SJunchao Zhang a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum; 4266b6c38306SJunchao Zhang } 4267219fbbafSJunchao Zhang } 4268219fbbafSJunchao Zhang 42699371c9d4SSatish Balay PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) { 4270219fbbafSJunchao Zhang Mat_SeqAIJ *seq = (Mat_SeqAIJ *)A->data; 4271219fbbafSJunchao Zhang Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr; 4272219fbbafSJunchao Zhang PetscCount Annz = seq->nz; 4273219fbbafSJunchao Zhang PetscMemType memtype; 4274219fbbafSJunchao Zhang const PetscScalar *v1 = v; 4275219fbbafSJunchao Zhang PetscScalar *Aa; 4276219fbbafSJunchao Zhang 4277219fbbafSJunchao Zhang PetscFunctionBegin; 4278219fbbafSJunchao Zhang if (dev->use_extended_coo) { 42799566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 4280219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 42819566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar))); 42829566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4283219fbbafSJunchao Zhang } 4284219fbbafSJunchao Zhang 42859566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); 42869566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); 4287219fbbafSJunchao Zhang 4288cbc6b225SStefano Zampini if (Annz) { 4289b6c38306SJunchao Zhang MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa); 42909566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 4291cbc6b225SStefano Zampini } 4292219fbbafSJunchao Zhang 42939566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa)); 42949566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa)); 4295219fbbafSJunchao Zhang 42969566063dSJacob Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); 4297219fbbafSJunchao Zhang } else { 42989566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode)); 4299219fbbafSJunchao Zhang } 4300219fbbafSJunchao Zhang PetscFunctionReturn(0); 4301219fbbafSJunchao Zhang } 4302219fbbafSJunchao Zhang 43035b7e41feSStefano Zampini /*@C 43045b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices. 43055b7e41feSStefano Zampini 43065b7e41feSStefano Zampini Not collective 43075b7e41feSStefano Zampini 43085b7e41feSStefano Zampini Input Parameters: 43095b7e41feSStefano Zampini + A - the matrix 43105b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 43115b7e41feSStefano Zampini 43125b7e41feSStefano Zampini Output Parameters: 43135b7e41feSStefano Zampini + ia - the CSR row pointers 43145b7e41feSStefano Zampini - ja - the CSR column indices 43155b7e41feSStefano Zampini 43165b7e41feSStefano Zampini Level: developer 43175b7e41feSStefano Zampini 43185b7e41feSStefano Zampini Notes: 43195b7e41feSStefano Zampini When compressed is true, the CSR structure does not contain empty rows 43205b7e41feSStefano Zampini 4321db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()` 43225b7e41feSStefano Zampini @*/ 43239371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) { 43245f101d05SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 43255f101d05SStefano Zampini CsrMatrix *csr; 43265f101d05SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 43275f101d05SStefano Zampini 43285f101d05SStefano Zampini PetscFunctionBegin; 43295f101d05SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 43305f101d05SStefano Zampini if (!i || !j) PetscFunctionReturn(0); 43315f101d05SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4332aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 43339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 433428b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 43355f101d05SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 43365f101d05SStefano Zampini if (i) { 43375f101d05SStefano Zampini if (!compressed && a->compressedrow.use) { /* need full row offset */ 43385f101d05SStefano Zampini if (!cusp->rowoffsets_gpu) { 43395f101d05SStefano Zampini cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 43405f101d05SStefano Zampini cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 43419566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 43425f101d05SStefano Zampini } 43435f101d05SStefano Zampini *i = cusp->rowoffsets_gpu->data().get(); 43445f101d05SStefano Zampini } else *i = csr->row_offsets->data().get(); 43455f101d05SStefano Zampini } 43465f101d05SStefano Zampini if (j) *j = csr->column_indices->data().get(); 43475f101d05SStefano Zampini PetscFunctionReturn(0); 43485f101d05SStefano Zampini } 43495f101d05SStefano Zampini 43505b7e41feSStefano Zampini /*@C 43515b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ() 43525b7e41feSStefano Zampini 43535b7e41feSStefano Zampini Not collective 43545b7e41feSStefano Zampini 43555b7e41feSStefano Zampini Input Parameters: 43565b7e41feSStefano Zampini + A - the matrix 43575b7e41feSStefano Zampini - compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form 43585b7e41feSStefano Zampini 43595b7e41feSStefano Zampini Output Parameters: 43605b7e41feSStefano Zampini + ia - the CSR row pointers 43615b7e41feSStefano Zampini - ja - the CSR column indices 43625b7e41feSStefano Zampini 43635b7e41feSStefano Zampini Level: developer 43645b7e41feSStefano Zampini 4365db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()` 43665b7e41feSStefano Zampini @*/ 43679371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) { 43685f101d05SStefano Zampini PetscFunctionBegin; 43695f101d05SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 43705f101d05SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 43715f101d05SStefano Zampini if (i) *i = NULL; 43725f101d05SStefano Zampini if (j) *j = NULL; 43735f101d05SStefano Zampini PetscFunctionReturn(0); 43745f101d05SStefano Zampini } 43755f101d05SStefano Zampini 43765b7e41feSStefano Zampini /*@C 43775b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 43785b7e41feSStefano Zampini 43795b7e41feSStefano Zampini Not Collective 43805b7e41feSStefano Zampini 43815b7e41feSStefano Zampini Input Parameter: 43825b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 43835b7e41feSStefano Zampini 43845b7e41feSStefano Zampini Output Parameter: 43855b7e41feSStefano Zampini . a - pointer to the device data 43865b7e41feSStefano Zampini 43875b7e41feSStefano Zampini Level: developer 43885b7e41feSStefano Zampini 43895b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 43905b7e41feSStefano Zampini 4391db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()` 43925b7e41feSStefano Zampini @*/ 43939371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) { 4394ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4395ed502f03SStefano Zampini CsrMatrix *csr; 4396ed502f03SStefano Zampini 4397ed502f03SStefano Zampini PetscFunctionBegin; 4398ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4399ed502f03SStefano Zampini PetscValidPointer(a, 2); 4400ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4401aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 44029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 440328b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4404ed502f03SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 440528b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4406ed502f03SStefano Zampini *a = csr->values->data().get(); 4407ed502f03SStefano Zampini PetscFunctionReturn(0); 4408ed502f03SStefano Zampini } 4409ed502f03SStefano Zampini 44105b7e41feSStefano Zampini /*@C 44115b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead() 44125b7e41feSStefano Zampini 44135b7e41feSStefano Zampini Not Collective 44145b7e41feSStefano Zampini 44155b7e41feSStefano Zampini Input Parameter: 44165b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 44175b7e41feSStefano Zampini 44185b7e41feSStefano Zampini Output Parameter: 44195b7e41feSStefano Zampini . a - pointer to the device data 44205b7e41feSStefano Zampini 44215b7e41feSStefano Zampini Level: developer 44225b7e41feSStefano Zampini 4423db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()` 44245b7e41feSStefano Zampini @*/ 44259371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) { 4426ed502f03SStefano Zampini PetscFunctionBegin; 4427ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4428ed502f03SStefano Zampini PetscValidPointer(a, 2); 4429ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4430ed502f03SStefano Zampini *a = NULL; 4431ed502f03SStefano Zampini PetscFunctionReturn(0); 4432ed502f03SStefano Zampini } 4433ed502f03SStefano Zampini 44345b7e41feSStefano Zampini /*@C 44355b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 44365b7e41feSStefano Zampini 44375b7e41feSStefano Zampini Not Collective 44385b7e41feSStefano Zampini 44395b7e41feSStefano Zampini Input Parameter: 44405b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 44415b7e41feSStefano Zampini 44425b7e41feSStefano Zampini Output Parameter: 44435b7e41feSStefano Zampini . a - pointer to the device data 44445b7e41feSStefano Zampini 44455b7e41feSStefano Zampini Level: developer 44465b7e41feSStefano Zampini 44475b7e41feSStefano Zampini Notes: may trigger host-device copies if up-to-date matrix data is on host 44485b7e41feSStefano Zampini 4449db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()` 44505b7e41feSStefano Zampini @*/ 44519371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) { 4452039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4453039c6fbaSStefano Zampini CsrMatrix *csr; 4454039c6fbaSStefano Zampini 4455039c6fbaSStefano Zampini PetscFunctionBegin; 4456039c6fbaSStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4457039c6fbaSStefano Zampini PetscValidPointer(a, 2); 4458039c6fbaSStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4459aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 44609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 446128b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4462039c6fbaSStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 446328b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4464039c6fbaSStefano Zampini *a = csr->values->data().get(); 4465039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 44669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4467039c6fbaSStefano Zampini PetscFunctionReturn(0); 4468039c6fbaSStefano Zampini } 44695b7e41feSStefano Zampini /*@C 44705b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray() 4471039c6fbaSStefano Zampini 44725b7e41feSStefano Zampini Not Collective 44735b7e41feSStefano Zampini 44745b7e41feSStefano Zampini Input Parameter: 44755b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 44765b7e41feSStefano Zampini 44775b7e41feSStefano Zampini Output Parameter: 44785b7e41feSStefano Zampini . a - pointer to the device data 44795b7e41feSStefano Zampini 44805b7e41feSStefano Zampini Level: developer 44815b7e41feSStefano Zampini 4482db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()` 44835b7e41feSStefano Zampini @*/ 44849371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) { 4485039c6fbaSStefano Zampini PetscFunctionBegin; 4486039c6fbaSStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4487039c6fbaSStefano Zampini PetscValidPointer(a, 2); 4488039c6fbaSStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 44899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 44909566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4491039c6fbaSStefano Zampini *a = NULL; 4492039c6fbaSStefano Zampini PetscFunctionReturn(0); 4493039c6fbaSStefano Zampini } 4494039c6fbaSStefano Zampini 44955b7e41feSStefano Zampini /*@C 44965b7e41feSStefano Zampini MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored 44975b7e41feSStefano Zampini 44985b7e41feSStefano Zampini Not Collective 44995b7e41feSStefano Zampini 45005b7e41feSStefano Zampini Input Parameter: 45015b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 45025b7e41feSStefano Zampini 45035b7e41feSStefano Zampini Output Parameter: 45045b7e41feSStefano Zampini . a - pointer to the device data 45055b7e41feSStefano Zampini 45065b7e41feSStefano Zampini Level: developer 45075b7e41feSStefano Zampini 45085b7e41feSStefano Zampini Notes: does not trigger host-device copies and flags data validity on the GPU 45095b7e41feSStefano Zampini 4510db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()` 45115b7e41feSStefano Zampini @*/ 45129371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) { 4513ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 4514ed502f03SStefano Zampini CsrMatrix *csr; 4515ed502f03SStefano Zampini 4516ed502f03SStefano Zampini PetscFunctionBegin; 4517ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4518ed502f03SStefano Zampini PetscValidPointer(a, 2); 4519ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4520aed4548fSBarry Smith PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 452128b400f6SJacob Faibussowitsch PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4522ed502f03SStefano Zampini csr = (CsrMatrix *)cusp->mat->mat; 452328b400f6SJacob Faibussowitsch PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory"); 4524ed502f03SStefano Zampini *a = csr->values->data().get(); 4525039c6fbaSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 45269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 4527ed502f03SStefano Zampini PetscFunctionReturn(0); 4528ed502f03SStefano Zampini } 4529ed502f03SStefano Zampini 45305b7e41feSStefano Zampini /*@C 45315b7e41feSStefano Zampini MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite() 45325b7e41feSStefano Zampini 45335b7e41feSStefano Zampini Not Collective 45345b7e41feSStefano Zampini 45355b7e41feSStefano Zampini Input Parameter: 45365b7e41feSStefano Zampini . A - a MATSEQAIJCUSPARSE matrix 45375b7e41feSStefano Zampini 45385b7e41feSStefano Zampini Output Parameter: 45395b7e41feSStefano Zampini . a - pointer to the device data 45405b7e41feSStefano Zampini 45415b7e41feSStefano Zampini Level: developer 45425b7e41feSStefano Zampini 4543db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()` 45445b7e41feSStefano Zampini @*/ 45459371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) { 4546ed502f03SStefano Zampini PetscFunctionBegin; 4547ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4548ed502f03SStefano Zampini PetscValidPointer(a, 2); 4549ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 45509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 45519566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 4552ed502f03SStefano Zampini *a = NULL; 4553ed502f03SStefano Zampini PetscFunctionReturn(0); 4554ed502f03SStefano Zampini } 4555ed502f03SStefano Zampini 45569371c9d4SSatish Balay struct IJCompare4 { 45579371c9d4SSatish Balay __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) { 4558ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 4559ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 4560ed502f03SStefano Zampini return false; 4561ed502f03SStefano Zampini } 4562ed502f03SStefano Zampini }; 4563ed502f03SStefano Zampini 45649371c9d4SSatish Balay struct Shift { 4565ed502f03SStefano Zampini int _shift; 4566ed502f03SStefano Zampini 4567ed502f03SStefano Zampini Shift(int shift) : _shift(shift) { } 45689371c9d4SSatish Balay __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4569ed502f03SStefano Zampini }; 4570ed502f03SStefano Zampini 4571ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */ 45729371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) { 4573ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4574ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4575ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4576ed502f03SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 4577ed502f03SStefano Zampini PetscInt Annz, Bnnz; 4578ed502f03SStefano Zampini cusparseStatus_t stat; 4579ed502f03SStefano Zampini PetscInt i, m, n, zero = 0; 4580ed502f03SStefano Zampini 4581ed502f03SStefano Zampini PetscFunctionBegin; 4582ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4583ed502f03SStefano Zampini PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4584ed502f03SStefano Zampini PetscValidPointer(C, 4); 4585ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4586ed502f03SStefano Zampini PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 45875f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 458808401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4589aed4548fSBarry Smith PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4590aed4548fSBarry Smith PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4591ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4592ed502f03SStefano Zampini m = A->rmap->n; 4593ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 45949566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, C)); 45959566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C, m, n, m, n)); 45969566063dSJacob Faibussowitsch PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4597ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4598ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4599ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4600ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4601ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4602ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4603ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4604ed502f03SStefano Zampini c->compressedrow.i = NULL; 4605ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4606ed502f03SStefano Zampini Ccusp->workVector = NULL; 4607ed502f03SStefano Zampini Ccusp->nrows = m; 4608ed502f03SStefano Zampini Ccusp->mat = Cmat; 4609ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4610ed502f03SStefano Zampini Ccsr->num_rows = m; 4611ed502f03SStefano Zampini Ccsr->num_cols = n; 46129566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 46139566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 46149566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 46159566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar))); 46169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar))); 46179566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar))); 46189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 46199566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 46209566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 46219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 46229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 462328b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 462428b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4625ed502f03SStefano Zampini 4626ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4627ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4628ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4629ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4630ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4631ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4632ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4633ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4634ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 4635ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 4636ed502f03SStefano Zampini if (c->nz) { 46372ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 46382ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 46392ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 46402ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff, *Broff; 46412ed87e7eSStefano Zampini 4642ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4643ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4644ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4645ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 46469566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4647ed502f03SStefano Zampini } 46482ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 46492ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4650ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4651ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4652ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4653ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 46549566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4655ed502f03SStefano Zampini } 46562ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 46572ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 46589566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 46599371c9d4SSatish Balay stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 46609371c9d4SSatish Balay PetscCallCUSPARSE(stat); 46619371c9d4SSatish Balay stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 46629371c9d4SSatish Balay PetscCallCUSPARSE(stat); 46632ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 46642ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 46652ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 46668909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4667ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4668ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 46698909a122SStefano Zampini #else 46708909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 46718909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 46728909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 46738909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 46748909a122SStefano Zampini #endif 46752ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 46762ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 46772ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 46782ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 46792ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 46802ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4681ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 4682ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 4683ed502f03SStefano Zampini thrust::advance(p2, Annz); 4684792fecdfSBarry Smith PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 46858909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 46868909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 46878909a122SStefano Zampini #endif 46882ed87e7eSStefano Zampini auto cci = thrust::make_counting_iterator(zero); 46892ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 46902ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 4691792fecdfSBarry Smith PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 46922ed87e7eSStefano Zampini #else 46932ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 4694792fecdfSBarry Smith PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4695792fecdfSBarry Smith PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 46962ed87e7eSStefano Zampini #endif 46979371c9d4SSatish Balay stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 46989371c9d4SSatish Balay PetscCallCUSPARSE(stat); 46999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 47002ed87e7eSStefano Zampini delete wPerm; 47012ed87e7eSStefano Zampini delete Acoo; 47022ed87e7eSStefano Zampini delete Bcoo; 47032ed87e7eSStefano Zampini delete Ccoo; 4704ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 47059371c9d4SSatish Balay stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 47069371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4707ed502f03SStefano Zampini #endif 47081a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 47099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 47109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4711ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4712ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4713ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4714ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4715ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4716ed502f03SStefano Zampini 47171a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 47181a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4719a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4720ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4721ed502f03SStefano Zampini CmatT->mat = CcsrT; 4722ed502f03SStefano Zampini CcsrT->num_rows = n; 4723ed502f03SStefano Zampini CcsrT->num_cols = m; 4724ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4725ed502f03SStefano Zampini 4726ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4727ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4728ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4729ed502f03SStefano Zampini 47309566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4731ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4732ed502f03SStefano Zampini if (AT) { 4733ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4734ed502f03SStefano Zampini thrust::advance(rT, -1); 4735ed502f03SStefano Zampini } 4736ed502f03SStefano Zampini if (BT) { 4737ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4738ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4739ed502f03SStefano Zampini thrust::copy(titb, tite, rT); 4740ed502f03SStefano Zampini } 4741ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4742ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4743ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4744ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4745ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4746ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 47479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4748ed502f03SStefano Zampini 47499566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 47509566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 47519566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 47529566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar))); 47539566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar))); 47549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar))); 47559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47569566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47579566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4758ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 47599371c9d4SSatish Balay stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 47609371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4761ed502f03SStefano Zampini #endif 4762ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4763ed502f03SStefano Zampini } 4764ed502f03SStefano Zampini } 4765ed502f03SStefano Zampini 4766ed502f03SStefano Zampini c->singlemalloc = PETSC_FALSE; 4767ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 4768ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 47699566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m + 1, &c->i)); 47709566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->j)); 4771ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 4772ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4773ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4774ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4775ed502f03SStefano Zampini jj = *Ccsr->column_indices; 47769566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 47779566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4778ed502f03SStefano Zampini } else { 47799566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 47809566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4781ed502f03SStefano Zampini } 47829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 47839566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 47849566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->imax)); 4785ed502f03SStefano Zampini c->maxnz = c->nz; 4786ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4787ed502f03SStefano Zampini c->rmax = 0; 4788ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4789ed502f03SStefano Zampini const PetscInt nn = c->i[i + 1] - c->i[i]; 4790ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4791ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt) !!nn; 4792ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 4793ed502f03SStefano Zampini } 47949566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 47959566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->a)); 4796ed502f03SStefano Zampini (*C)->nonzerostate++; 47979566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 47989566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 4799ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4800ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4801ed502f03SStefano Zampini } else { 480208401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4803ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4804ed502f03SStefano Zampini if (c->nz) { 4805ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 48065f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm"); 4807aed4548fSBarry Smith PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 480808401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 48099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 48109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 48115f80ce2aSJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 48125f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4813ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4814ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4815ed502f03SStefano Zampini Ccsr = (CsrMatrix *)Ccusp->mat->mat; 4816aed4548fSBarry Smith PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4817aed4548fSBarry Smith PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4818aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4819aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 48205f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size()); 4821ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 4822ed502f03SStefano Zampini thrust::advance(pmid, Acsr->num_entries); 48239566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 48249371c9d4SSatish Balay auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin()))); 48259371c9d4SSatish Balay auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4826ed502f03SStefano Zampini thrust::for_each(zibait, zieait, VecCUDAEquals()); 48279371c9d4SSatish Balay auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 48289371c9d4SSatish Balay auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end()))); 4829ed502f03SStefano Zampini thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 48309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 48311a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 48325f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4833ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4834ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4835ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4836ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 4837ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4838ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4839ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 48401a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4841ed502f03SStefano Zampini } 48429566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4843ed502f03SStefano Zampini } 4844ed502f03SStefano Zampini } 48459566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 4846ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4847ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4848ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4849ed502f03SStefano Zampini PetscFunctionReturn(0); 4850ed502f03SStefano Zampini } 4851c215019aSStefano Zampini 48529371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) { 4853c215019aSStefano Zampini bool dmem; 4854c215019aSStefano Zampini const PetscScalar *av; 4855c215019aSStefano Zampini 4856c215019aSStefano Zampini PetscFunctionBegin; 4857c215019aSStefano Zampini dmem = isCudaMem(v); 48589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av)); 4859c215019aSStefano Zampini if (n && idx) { 4860c215019aSStefano Zampini THRUSTINTARRAY widx(n); 4861c215019aSStefano Zampini widx.assign(idx, idx + n); 48629566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 4863c215019aSStefano Zampini 4864c215019aSStefano Zampini THRUSTARRAY *w = NULL; 4865c215019aSStefano Zampini thrust::device_ptr<PetscScalar> dv; 4866c215019aSStefano Zampini if (dmem) { 4867c215019aSStefano Zampini dv = thrust::device_pointer_cast(v); 4868c215019aSStefano Zampini } else { 4869c215019aSStefano Zampini w = new THRUSTARRAY(n); 4870c215019aSStefano Zampini dv = w->data(); 4871c215019aSStefano Zampini } 4872c215019aSStefano Zampini thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av); 4873c215019aSStefano Zampini 4874c215019aSStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv)); 4875c215019aSStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n)); 4876c215019aSStefano Zampini thrust::for_each(zibit, zieit, VecCUDAEquals()); 4877*48a46eb9SPierre Jolivet if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 4878c215019aSStefano Zampini delete w; 4879c215019aSStefano Zampini } else { 48809566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost)); 4881c215019aSStefano Zampini } 48829566063dSJacob Faibussowitsch if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar))); 48839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av)); 4884c215019aSStefano Zampini PetscFunctionReturn(0); 4885c215019aSStefano Zampini } 4886