19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 69ae82921SPaul Mullowney 73d13b8fdSMatthew G. Knepley #include <petscconf.h> 83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 11af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 129ae82921SPaul Mullowney #undef VecType 133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14a2cee5feSJed Brown #include <thrust/adjacent_difference.h> 15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14 16d0967f54SJacob Faibussowitsch #define PETSC_HAVE_THRUST_ASYNC 1 17d0967f54SJacob Faibussowitsch // thrust::for_each(thrust::cuda::par.on()) requires C++14 18d0967f54SJacob Faibussowitsch #endif 19a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h> 20a2cee5feSJed Brown #include <thrust/remove.h> 21a2cee5feSJed Brown #include <thrust/sort.h> 22a2cee5feSJed Brown #include <thrust/unique.h> 2359c3d2bbSPierre Jolivet #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST) 2459c3d2bbSPierre Jolivet #include <cuda/std/functional> 2559c3d2bbSPierre Jolivet #endif 26e8d2b73aSMark Adams 27e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 28afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2946aba097SBarry Smith /* 3046aba097SBarry Smith The following are copied from cusparse.h in CUDA-11.0. 
In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 31afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 32afb2bd1cSJunchao Zhang */ 33afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 34afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 35afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 36afb2bd1cSJunchao Zhang #endif 379ae82921SPaul Mullowney 38087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 39087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 40087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 416fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 42b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 436fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 446fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 45d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 466fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 47d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 48d460d7bfSJunchao Zhang #endif 49ce78bad3SBarry Smith static PetscErrorCode 
MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject); 50a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 5133c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 526fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 536fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 546fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 556fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 56e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 57e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 58e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 599ae82921SPaul Mullowney 607f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 61470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 62470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **); 632c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat); 647f756511SDominic Meiser 6557181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 66a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool); 6757181aedSStefano Zampini 68c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]); 69e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]); 70219fbbafSJunchao Zhang static PetscErrorCode 
MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);

/* Record the requested GPU storage format on the cuSPARSE-side data attached to A.
   For SeqAIJCUSPARSE both supported operations store into the same field, so the
   two cases fall through to a single assignment. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: same storage field as MAT_CUSPARSE_ALL */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
. op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
       `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation, if the type provides one */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Toggle whether triangular solves with the factors of A run on the CPU instead of the GPU */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there.
  This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  switch (op) {
  case MAT_FORM_EXPLICIT_TRANSPOSE:
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
A->form_explicit_transpose = flg;
    break;
  default:
    /* everything else is handled by the base SeqAIJ implementation */
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Process runtime options selecting GPU storage formats and cuSPARSE algorithm variants */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg,
&flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* From the host (skewed) LU factors of A, assemble one regular CSR matrix M = (L-I) + U on the
   device, and (on first call) create the cuSPARSE descriptors and SpSV analysis data used by the
   triangular solves. On later calls only the numerical values are refreshed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) { // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = Adiag[i] - Adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Legacy (csrsv) path: build the unit-diagonal lower triangular ILU factor on the device */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                      n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 3119ae82921SPaul Mullowney const PetscInt *ai = a->i, *aj = a->j, *vi; 3129ae82921SPaul Mullowney const MatScalar *aa = a->a, *v; 3139ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3149ae82921SPaul Mullowney PetscInt i, nz, nzLower, offset, rowOffset; 3159ae82921SPaul Mullowney 3169ae82921SPaul Mullowney PetscFunctionBegin; 3173ba16761SJacob Faibussowitsch if (!n) PetscFunctionReturn(PETSC_SUCCESS); 318c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3199ae82921SPaul Mullowney try { 3209ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3219ae82921SPaul Mullowney nzLower = n + ai[n] - ai[1]; 322da79fbbcSStefano Zampini if (!loTriFactor) { 3232cbc15d9SMark PetscScalar *AALo; 3242cbc15d9SMark 3259566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar))); 3269ae82921SPaul Mullowney 3279ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 3289566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt))); 3299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt))); 3309ae82921SPaul Mullowney 3319ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3329ae82921SPaul Mullowney AiLo[0] = (PetscInt)0; 3339ae82921SPaul Mullowney AiLo[n] = nzLower; 3349ae82921SPaul Mullowney AjLo[0] = (PetscInt)0; 3359ae82921SPaul Mullowney AALo[0] = (MatScalar)1.0; 3369ae82921SPaul Mullowney v = aa; 3379ae82921SPaul Mullowney vi = aj; 3389ae82921SPaul Mullowney offset = 1; 3399ae82921SPaul Mullowney rowOffset = 1; 3409ae82921SPaul Mullowney for (i = 1; i < n; i++) { 3419ae82921SPaul Mullowney nz = ai[i + 1] - ai[i]; 342e057df02SPaul Mullowney /* additional 1 for the 
term on the diagonal */ 3439ae82921SPaul Mullowney AiLo[i] = rowOffset; 3449ae82921SPaul Mullowney rowOffset += nz + 1; 3459ae82921SPaul Mullowney 346f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AjLo[offset], vi, nz)); 347f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AALo[offset], v, nz)); 3489ae82921SPaul Mullowney 3499ae82921SPaul Mullowney offset += nz; 3509ae82921SPaul Mullowney AjLo[offset] = (PetscInt)i; 3519ae82921SPaul Mullowney AALo[offset] = (MatScalar)1.0; 3529ae82921SPaul Mullowney offset += 1; 3539ae82921SPaul Mullowney 3549ae82921SPaul Mullowney v += nz; 3559ae82921SPaul Mullowney vi += nz; 3569ae82921SPaul Mullowney } 3572205254eSKarl Rupp 358aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 3599566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 360da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 361aa372e3fSPaul Mullowney /* Create the matrix description */ 3629566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 3639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 3641b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 3659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 366afb2bd1cSJunchao Zhang #else 3679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 368afb2bd1cSJunchao Zhang #endif 3699566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 3709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 371aa372e3fSPaul Mullowney 372aa372e3fSPaul Mullowney /* set the operation */ 373aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 374aa372e3fSPaul 
Mullowney 375aa372e3fSPaul Mullowney /* set the matrix */ 376aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 377aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 378aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 379aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 380aa372e3fSPaul Mullowney 381aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 382aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1); 383aa372e3fSPaul Mullowney 384aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 385aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower); 386aa372e3fSPaul Mullowney 387aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 388aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo + nzLower); 389aa372e3fSPaul Mullowney 390afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 3919566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 392261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 3931b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 3949371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 3959371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 3969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 397afb2bd1cSJunchao Zhang #endif 398afb2bd1cSJunchao Zhang 399aa372e3fSPaul Mullowney /* perform the solve analysis */ 4009371c9d4SSatish Balay 
PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 4019f7ba44dSJacob Faibussowitsch loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 4029566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 4039566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 404aa372e3fSPaul Mullowney 405da79fbbcSStefano Zampini /* assign the pointer */ 406aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 4072cbc15d9SMark loTriFactor->AA_h = AALo; 4089566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiLo)); 4099566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjLo)); 4109566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar))); 411da79fbbcSStefano Zampini } else { /* update values only */ 41248a46eb9SPierre Jolivet if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar))); 413da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4142cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 415da79fbbcSStefano Zampini v = aa; 416da79fbbcSStefano Zampini vi = aj; 417da79fbbcSStefano Zampini offset = 1; 418da79fbbcSStefano Zampini for (i = 1; i < n; i++) { 419da79fbbcSStefano Zampini nz = ai[i + 1] - ai[i]; 420f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz)); 421da79fbbcSStefano Zampini offset += nz; 4222cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 423da79fbbcSStefano Zampini offset += 1; 424da79fbbcSStefano Zampini v += nz; 425da79fbbcSStefano Zampini } 4262cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + 
nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Builds (on first call) or value-updates (on later calls) the upper triangular factor U of the
  host ILU factorization as a 32-bit-index CSR matrix on the GPU, and performs the cusparse
  triangular-solve analysis on it. Legacy csrsv code path, compiled only for CUDA < 11.4
  (see the enclosing PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) guard).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          /* NOTE(review): v[nz] appears to hold the inverted pivot stored by the host factorization,
             so 1./v[nz] recovers the actual diagonal (descr uses CUSPARSE_DIAG_TYPE_NON_UNIT below)
             — confirm against MatLUFactorNumeric_SeqAIJ */
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only; the sparsity pattern and analysis are reused */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Ensures the triangular factors of an ILU factorization are present and current on the GPU,
   and caches the row/column permutations of the factorization as device index arrays when
   they are not identity permutations. */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            isrow = a->row, isicol = a->icol;
  PetscBool                     row_identity, col_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
#endif

cusparseTriFactors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
  /* lower triangular indices */
  PetscCall(ISIdentity(isrow, &row_identity));
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    PetscCall(ISGetIndices(isrow, &r));
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r + n);
    PetscCall(ISRestoreIndices(isrow, &r));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }

  /* upper triangular indices */
  PetscCall(ISIdentity(isicol, &col_identity));
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    PetscCall(ISGetIndices(isicol, &c));
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c + n);
    PetscCall(ISRestoreIndices(isicol, &c));
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Builds (on first call) or value-updates the Cholesky/ICC factor U (stored with a unit
// diagonal) and the separate diagonal D on the GPU from the host factorization, and
// prepares/updates the cusparse SpSV descriptors used to solve with U and Ut.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) { // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
#endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

// Solve Ut D U x = b
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

// Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
static PetscErrorCode
MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 767d71ae5a4SJacob Faibussowitsch { 768087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 769087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 770aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 771aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 772087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 773087f3262SPaul Mullowney PetscScalar *AAUp; 774087f3262SPaul Mullowney PetscScalar *AALo; 775087f3262SPaul Mullowney PetscInt nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j; 776087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 777087f3262SPaul Mullowney const PetscInt *ai = b->i, *aj = b->j, *vj; 778087f3262SPaul Mullowney const MatScalar *aa = b->a, *v; 779087f3262SPaul Mullowney 780087f3262SPaul Mullowney PetscFunctionBegin; 7813ba16761SJacob Faibussowitsch if (!n) PetscFunctionReturn(PETSC_SUCCESS); 782c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 783087f3262SPaul Mullowney try { 7849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 7859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 786da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 787087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 7889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 7899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 790087f3262SPaul Mullowney 791087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 792087f3262SPaul Mullowney 
AiUp[0] = (PetscInt)0; 793087f3262SPaul Mullowney AiUp[n] = nzUpper; 794087f3262SPaul Mullowney offset = 0; 795087f3262SPaul Mullowney for (i = 0; i < n; i++) { 796087f3262SPaul Mullowney /* set the pointers */ 797087f3262SPaul Mullowney v = aa + ai[i]; 798087f3262SPaul Mullowney vj = aj + ai[i]; 799087f3262SPaul Mullowney nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 800087f3262SPaul Mullowney 801087f3262SPaul Mullowney /* first, set the diagonal elements */ 802087f3262SPaul Mullowney AjUp[offset] = (PetscInt)i; 80309f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0 / v[nz]; 804087f3262SPaul Mullowney AiUp[i] = offset; 80509f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0 / v[nz]; 806087f3262SPaul Mullowney 807087f3262SPaul Mullowney offset += 1; 808087f3262SPaul Mullowney if (nz > 0) { 809f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AjUp[offset], vj, nz)); 810f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 811087f3262SPaul Mullowney for (j = offset; j < offset + nz; j++) { 812087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 813087f3262SPaul Mullowney AALo[j] = AAUp[j] / v[nz]; 814087f3262SPaul Mullowney } 815087f3262SPaul Mullowney offset += nz; 816087f3262SPaul Mullowney } 817087f3262SPaul Mullowney } 818087f3262SPaul Mullowney 819aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8209566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactor)); 821da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 822087f3262SPaul Mullowney 823aa372e3fSPaul Mullowney /* Create the matrix description */ 8249566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 8259566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8261b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8279566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 828afb2bd1cSJunchao Zhang #else 8299566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 830afb2bd1cSJunchao Zhang #endif 8319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 833087f3262SPaul Mullowney 834aa372e3fSPaul Mullowney /* set the matrix */ 835aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 836aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 837aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 838aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 839aa372e3fSPaul Mullowney 840aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 841aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 842aa372e3fSPaul Mullowney 843aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 844aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 845aa372e3fSPaul Mullowney 846aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 847aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 848aa372e3fSPaul Mullowney 849afb2bd1cSJunchao Zhang /* set the operation */ 850afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 851afb2bd1cSJunchao Zhang 852afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 8539566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 854261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 8551b0a6780SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8569371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 8579371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 8589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 859afb2bd1cSJunchao Zhang #endif 860afb2bd1cSJunchao Zhang 861aa372e3fSPaul Mullowney /* perform the solve analysis */ 8629371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 8639f7ba44dSJacob Faibussowitsch upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 8649f7ba44dSJacob Faibussowitsch 8659566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 8669566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 867aa372e3fSPaul Mullowney 868da79fbbcSStefano Zampini /* assign the pointer */ 869aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 870aa372e3fSPaul Mullowney 871aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 8729566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactor)); 873da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 874aa372e3fSPaul Mullowney 875aa372e3fSPaul Mullowney /* Create the matrix description */ 8769566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 
8779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 8781b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 8799566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 880afb2bd1cSJunchao Zhang #else 8819566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 882afb2bd1cSJunchao Zhang #endif 8839566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 8849566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 885aa372e3fSPaul Mullowney 886aa372e3fSPaul Mullowney /* set the operation */ 887aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 888aa372e3fSPaul Mullowney 889aa372e3fSPaul Mullowney /* set the matrix */ 890aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 891aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 892aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 893aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 894aa372e3fSPaul Mullowney 895aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 896aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 897aa372e3fSPaul Mullowney 898aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 899aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 900aa372e3fSPaul Mullowney 901aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 902aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 903aa372e3fSPaul Mullowney 904afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 
9059566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 906261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 9071b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 9089371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 9099371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 9109566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 911afb2bd1cSJunchao Zhang #endif 912afb2bd1cSJunchao Zhang 913aa372e3fSPaul Mullowney /* perform the solve analysis */ 9149371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 9159f7ba44dSJacob Faibussowitsch loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 9169f7ba44dSJacob Faibussowitsch 9179566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 9189566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 919aa372e3fSPaul Mullowney 920da79fbbcSStefano Zampini /* assign the pointer */ 921aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 922087f3262SPaul Mullowney 9239566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 9249566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AiUp)); 
9259566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AjUp)); 926da79fbbcSStefano Zampini } else { 927da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 928da79fbbcSStefano Zampini offset = 0; 929da79fbbcSStefano Zampini for (i = 0; i < n; i++) { 930da79fbbcSStefano Zampini /* set the pointers */ 931da79fbbcSStefano Zampini v = aa + ai[i]; 932da79fbbcSStefano Zampini nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 933da79fbbcSStefano Zampini 934da79fbbcSStefano Zampini /* first, set the diagonal elements */ 935da79fbbcSStefano Zampini AAUp[offset] = 1.0 / v[nz]; 936da79fbbcSStefano Zampini AALo[offset] = 1.0 / v[nz]; 937da79fbbcSStefano Zampini 938da79fbbcSStefano Zampini offset += 1; 939da79fbbcSStefano Zampini if (nz > 0) { 940f4f49eeaSPierre Jolivet PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 941da79fbbcSStefano Zampini for (j = offset; j < offset + nz; j++) { 942da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 943da79fbbcSStefano Zampini AALo[j] = AAUp[j] / v[nz]; 944da79fbbcSStefano Zampini } 945da79fbbcSStefano Zampini offset += nz; 946da79fbbcSStefano Zampini } 947da79fbbcSStefano Zampini } 94828b400f6SJacob Faibussowitsch PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 94928b400f6SJacob Faibussowitsch PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 950da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 951da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 9529566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 953da79fbbcSStefano Zampini } 9549566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AAUp)); 9559566063dSJacob Faibussowitsch PetscCallCUDA(cudaFreeHost(AALo)); 956d71ae5a4SJacob Faibussowitsch } catch (char *ex) { 957d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 958d71ae5a4SJacob Faibussowitsch } 
959087f3262SPaul Mullowney } 9603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 961087f3262SPaul Mullowney } 962d460d7bfSJunchao Zhang #endif 963087f3262SPaul Mullowney 964d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 965d71ae5a4SJacob Faibussowitsch { 966087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 967087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 968087f3262SPaul Mullowney IS ip = a->row; 969087f3262SPaul Mullowney PetscBool perm_identity; 970087f3262SPaul Mullowney PetscInt n = A->rmap->n; 971087f3262SPaul Mullowney 972087f3262SPaul Mullowney PetscFunctionBegin; 97328b400f6SJacob Faibussowitsch PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 974d460d7bfSJunchao Zhang 975b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 976*5c7eeb11SPierre Jolivet PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cholesky(A)); 977d460d7bfSJunchao Zhang #else 9789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 979ad540459SPierre Jolivet if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 980d460d7bfSJunchao Zhang #endif 981aa372e3fSPaul Mullowney cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 982aa372e3fSPaul Mullowney 983da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 984da79fbbcSStefano Zampini 985087f3262SPaul Mullowney /* lower triangular indices */ 9869566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip, &perm_identity)); 987087f3262SPaul Mullowney if (!perm_identity) { 9884e4bbfaaSStefano Zampini IS iip; 989da79fbbcSStefano Zampini const PetscInt *irip, *rip; 9904e4bbfaaSStefano Zampini 9919566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 9929566063dSJacob Faibussowitsch PetscCall(ISGetIndices(iip, &irip)); 9939566063dSJacob Faibussowitsch 
PetscCall(ISGetIndices(ip, &rip)); 994aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 995aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip + n); 996aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9974e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip + n); 9989566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iip, &irip)); 9999566063dSJacob Faibussowitsch PetscCall(ISDestroy(&iip)); 10009566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ip, &rip)); 10019566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 1002da79fbbcSStefano Zampini } 10033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1004087f3262SPaul Mullowney } 1005087f3262SPaul Mullowney 1006d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1007d71ae5a4SJacob Faibussowitsch { 1008087f3262SPaul Mullowney PetscFunctionBegin; 10099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 10109566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1011ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 1012d460d7bfSJunchao Zhang 1013b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1014d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1015d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1016d460d7bfSJunchao Zhang #else 1017087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 1018d460d7bfSJunchao Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1019d460d7bfSJunchao Zhang IS ip = b->row; 1020d460d7bfSJunchao Zhang PetscBool perm_identity; 1021d460d7bfSJunchao Zhang 10229566063dSJacob Faibussowitsch PetscCall(ISIdentity(ip, &perm_identity)); 1023087f3262SPaul Mullowney if (perm_identity) { 1024087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1025087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1026087f3262SPaul Mullowney } else { 1027087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1028087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1029d460d7bfSJunchao Zhang } 1030d460d7bfSJunchao Zhang #endif 10314e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 10324e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 1033087f3262SPaul Mullowney 1034087f3262SPaul Mullowney /* get the triangular factors */ 10359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 10363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1037087f3262SPaul Mullowney } 10389ae82921SPaul Mullowney 1039b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1040d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1041d71ae5a4SJacob Faibussowitsch { 1042bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1043aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1044aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1045da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1046da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1047aa372e3fSPaul Mullowney 
cusparseIndexBase_t indexBase; 1048aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1049aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 1050aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 1051b175d8bbSPaul Mullowney 1052bda325fcSPaul Mullowney PetscFunctionBegin; 1053aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 10549566063dSJacob Faibussowitsch PetscCall(PetscNew(&loTriFactorT)); 1055da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1056aa372e3fSPaul Mullowney 1057aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1058aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1059aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 10609371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1061aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1062aa372e3fSPaul Mullowney 1063aa372e3fSPaul Mullowney /* Create the matrix description */ 10649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 10659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 10669566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 10679566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 10689566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1069aa372e3fSPaul Mullowney 1070aa372e3fSPaul Mullowney /* set the operation */ 1071aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1072aa372e3fSPaul Mullowney 1073aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 
1074aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1075afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1076afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1077aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1078afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1079afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1080afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1081aa372e3fSPaul Mullowney 1082aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1083afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 10849371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 10859371c9d4SSatish Balay loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 10869371c9d4SSatish Balay loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 10879566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1088afb2bd1cSJunchao Zhang #endif 1089afb2bd1cSJunchao Zhang 10909566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 10919f7ba44dSJacob Faibussowitsch { 10929f7ba44dSJacob Faibussowitsch // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
10939f7ba44dSJacob Faibussowitsch auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 10949371c9d4SSatish Balay loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1095afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 10969f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1097afb2bd1cSJunchao Zhang #else 10989f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1099afb2bd1cSJunchao Zhang #endif 11009f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(stat); 11019f7ba44dSJacob Faibussowitsch } 11029f7ba44dSJacob Faibussowitsch 11039566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11049566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1105aa372e3fSPaul Mullowney 1106afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 11079566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1108261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 11091b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 11109371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 11119371c9d4SSatish Balay loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), 
loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 11129566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1113afb2bd1cSJunchao Zhang #endif 1114afb2bd1cSJunchao Zhang 1115afb2bd1cSJunchao Zhang /* perform the solve analysis */ 11169371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 11179f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 11189f7ba44dSJacob Faibussowitsch 11199566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11209566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1121aa372e3fSPaul Mullowney 1122da79fbbcSStefano Zampini /* assign the pointer */ 1123aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1124aa372e3fSPaul Mullowney 1125aa372e3fSPaul Mullowney /*********************************************/ 1126aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1127aa372e3fSPaul Mullowney /*********************************************/ 1128aa372e3fSPaul Mullowney 1129aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 11309566063dSJacob Faibussowitsch PetscCall(PetscNew(&upTriFactorT)); 1131da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1132aa372e3fSPaul Mullowney 1133aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1134aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1135aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 
11369371c9d4SSatish Balay fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1137aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1138aa372e3fSPaul Mullowney 1139aa372e3fSPaul Mullowney /* Create the matrix description */ 11409566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 11419566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 11429566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 11439566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 11449566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1145aa372e3fSPaul Mullowney 1146aa372e3fSPaul Mullowney /* set the operation */ 1147aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1148aa372e3fSPaul Mullowney 1149aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1150aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1151afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1152afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1153aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1154afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1155afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1156afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1157aa372e3fSPaul Mullowney 1158aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1159afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11609371c9d4SSatish Balay PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 11619371c9d4SSatish Balay upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 11629371c9d4SSatish Balay upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 11639566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1164afb2bd1cSJunchao Zhang #endif 1165afb2bd1cSJunchao Zhang 11669566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 11679f7ba44dSJacob Faibussowitsch { 11689f7ba44dSJacob Faibussowitsch // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
11699f7ba44dSJacob Faibussowitsch auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 11709371c9d4SSatish Balay upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1171afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 11729f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1173afb2bd1cSJunchao Zhang #else 11749f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1175afb2bd1cSJunchao Zhang #endif 11769f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(stat); 11779f7ba44dSJacob Faibussowitsch } 1178d49cd2b7SBarry Smith 11799566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11809566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1181aa372e3fSPaul Mullowney 1182afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 11839566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1184261a78b4SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo)); 11851b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 11869371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 11879371c9d4SSatish Balay upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), 
upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize)); 11889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize)); 1189afb2bd1cSJunchao Zhang #endif 1190afb2bd1cSJunchao Zhang 1191afb2bd1cSJunchao Zhang /* perform the solve analysis */ 11925f80ce2aSJacob Faibussowitsch /* christ, would it have killed you to put this stuff in a function????????? */ 11939371c9d4SSatish Balay PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 11949f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1195d49cd2b7SBarry Smith 11969566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 11979566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1198aa372e3fSPaul Mullowney 1199da79fbbcSStefano Zampini /* assign the pointer */ 1200aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 12013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1202bda325fcSPaul Mullowney } 1203d460d7bfSJunchao Zhang #endif 1204bda325fcSPaul Mullowney 12059371c9d4SSatish Balay struct PetscScalarToPetscInt { 12069371c9d4SSatish Balay __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); } 1207a49f1ed0SStefano Zampini }; 1208a49f1ed0SStefano Zampini 1209d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) 1210d71ae5a4SJacob Faibussowitsch { 1211aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 1212a49f1ed0SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT; 
1213bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1214bda325fcSPaul Mullowney cusparseStatus_t stat; 1215aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1216b175d8bbSPaul Mullowney 1217bda325fcSPaul Mullowney PetscFunctionBegin; 12189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 1219a49f1ed0SStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 122028b400f6SJacob Faibussowitsch PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct"); 1221a49f1ed0SStefano Zampini matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 122208401ef6SPierre Jolivet PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct"); 12233ba16761SJacob Faibussowitsch if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); 12249566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 12259566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 122648a46eb9SPierre Jolivet if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 1227a49f1ed0SStefano Zampini if (!cusparsestruct->matTranspose) { /* create cusparse matrix */ 1228aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 12299566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr)); 1230aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 12319566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase)); 12329566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 1233aa372e3fSPaul Mullowney 1234b06137fdSPaul Mullowney /* set alpha and beta */ 1235f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar))); 1236f4f49eeaSPierre Jolivet 
PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar))); 1237f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar))); 12389566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 12399566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 12409566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 1241b06137fdSPaul Mullowney 1242aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 1243aa372e3fSPaul Mullowney CsrMatrix *matrixT = new CsrMatrix; 1244a49f1ed0SStefano Zampini matstructT->mat = matrixT; 1245554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1246554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1247aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1248a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows + 1); 1249aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1250aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1251a3fdcf43SKarl Rupp 1252ad540459SPierre Jolivet if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 125381902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 1254afb2bd1cSJunchao Zhang 1255afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 12563606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1) 12579371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx 
type due to THRUSTINTARRAY32 */ 12589371c9d4SSatish Balay indexBase, cusparse_scalartype); 12599371c9d4SSatish Balay PetscCallCUSPARSE(stat); 12603606e59fSJunchao Zhang #else 12613606e59fSJunchao Zhang /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1, 12623606e59fSJunchao Zhang see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1 12633606e59fSJunchao Zhang 12643606e59fSJunchao Zhang I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set 12653606e59fSJunchao Zhang it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, 12663606e59fSJunchao Zhang when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly. 12673606e59fSJunchao Zhang */ 12683606e59fSJunchao Zhang if (matrixT->num_entries) { 12699371c9d4SSatish Balay stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype); 12709371c9d4SSatish Balay PetscCallCUSPARSE(stat); 12713606e59fSJunchao Zhang 12723606e59fSJunchao Zhang } else { 12733606e59fSJunchao Zhang matstructT->matDescr = NULL; 12743606e59fSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 12753606e59fSJunchao Zhang } 12763606e59fSJunchao Zhang #endif 1277afb2bd1cSJunchao Zhang #endif 1278aa372e3fSPaul Mullowney } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 1279afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1280afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1281afb2bd1cSJunchao Zhang #else 1282aa372e3fSPaul Mullowney CsrMatrix *temp = new 
CsrMatrix; 128351c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 128451c6d536SStefano Zampini /* First convert HYB to CSR */ 1285aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1286aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1287aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1288aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1289aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1290aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1291aa372e3fSPaul Mullowney 12929371c9d4SSatish Balay stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get()); 12939371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1294aa372e3fSPaul Mullowney 1295aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1296aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1297aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1298aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1299aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 1300aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1301aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1302aa372e3fSPaul Mullowney 13039371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(), 13049371c9d4SSatish Balay tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 13059371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1306aa372e3fSPaul Mullowney 1307aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1308aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 13099566063dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 13109371c9d4SSatish Balay cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 13119371c9d4SSatish Balay stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition); 13129371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1313aa372e3fSPaul Mullowney 1314aa372e3fSPaul Mullowney /* assign the pointer */ 1315aa372e3fSPaul Mullowney matstructT->mat = hybMat; 13161a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 1317aa372e3fSPaul Mullowney /* delete temporaries */ 1318aa372e3fSPaul Mullowney if (tempT) { 1319aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY *)tempT->values; 1320aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices; 1321aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets; 1322aa372e3fSPaul Mullowney delete (CsrMatrix *)tempT; 1323087f3262SPaul Mullowney } 1324aa372e3fSPaul Mullowney if (temp) { 1325aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY *)temp->values; 1326aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices; 1327aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets; 1328aa372e3fSPaul Mullowney delete (CsrMatrix *)temp; 1329aa372e3fSPaul Mullowney } 1330afb2bd1cSJunchao Zhang #endif 1331aa372e3fSPaul Mullowney } 1332a49f1ed0SStefano Zampini } 1333a49f1ed0SStefano Zampini if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */ 1334a49f1ed0SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)matstruct->mat; 1335a49f1ed0SStefano Zampini CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat; 133628b400f6SJacob Faibussowitsch 
PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix"); 133728b400f6SJacob Faibussowitsch PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows"); 133828b400f6SJacob Faibussowitsch PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols"); 133928b400f6SJacob Faibussowitsch PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values"); 134028b400f6SJacob Faibussowitsch PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT"); 134128b400f6SJacob Faibussowitsch PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows"); 134228b400f6SJacob Faibussowitsch PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols"); 134328b400f6SJacob Faibussowitsch PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values"); 1344a49f1ed0SStefano Zampini if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */ 1345a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 1346a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 13479566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 1348a49f1ed0SStefano Zampini } 1349a49f1ed0SStefano Zampini if (!cusparsestruct->csr2csc_i) { 1350a49f1ed0SStefano Zampini THRUSTARRAY csr2csc_a(matrix->num_entries); 1351792fecdfSBarry Smith PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0)); 1352a49f1ed0SStefano Zampini 1353a49f1ed0SStefano Zampini indexBase = cusparseGetMatIndexBase(matstruct->descr); 1354a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1355a49f1ed0SStefano Zampini void *csr2cscBuffer; 1356a49f1ed0SStefano Zampini size_t csr2cscBufferSize; 13579371c9d4SSatish Balay stat = 
cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 13589371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize); 13599371c9d4SSatish Balay PetscCallCUSPARSE(stat); 13609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize)); 1361a49f1ed0SStefano Zampini #endif 1362a49f1ed0SStefano Zampini 13631a2c6b5cSJunchao Zhang if (matrix->num_entries) { 13641a2c6b5cSJunchao Zhang /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in 13651a2c6b5cSJunchao Zhang mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK. 13661a2c6b5cSJunchao Zhang I checked every parameters and they were just fine. I have no clue why cusparse complains. 13671a2c6b5cSJunchao Zhang 13681a2c6b5cSJunchao Zhang Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[] 13691a2c6b5cSJunchao Zhang should be filled with indexBase. So I just take a shortcut here. 
13701a2c6b5cSJunchao Zhang */ 13719371c9d4SSatish Balay stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(), 1372a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 13739371c9d4SSatish Balay matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer); 13749371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1375a49f1ed0SStefano Zampini #else 13769371c9d4SSatish Balay matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 13779371c9d4SSatish Balay PetscCallCUSPARSE(stat); 1378a49f1ed0SStefano Zampini #endif 13791a2c6b5cSJunchao Zhang } else { 13801a2c6b5cSJunchao Zhang matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase); 13811a2c6b5cSJunchao Zhang } 13821a2c6b5cSJunchao Zhang 1383a49f1ed0SStefano Zampini cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries); 1384792fecdfSBarry Smith PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt())); 1385a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 13869566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(csr2cscBuffer)); 1387a49f1ed0SStefano Zampini #endif 1388a49f1ed0SStefano Zampini } 13899371c9d4SSatish Balay PetscCallThrust( 13909371c9d4SSatish Balay thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin())); 1391a49f1ed0SStefano Zampini } 13929566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 13939566063dSJacob 
Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1394213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1395213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1396aa372e3fSPaul Mullowney /* assign the pointer */ 1397aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT; 13981a2c6b5cSJunchao Zhang A->transupdated = PETSC_TRUE; 13993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1400bda325fcSPaul Mullowney } 1401bda325fcSPaul Mullowney 1402b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1403d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1404d460d7bfSJunchao Zhang { 1405d460d7bfSJunchao Zhang const PetscScalar *barray; 1406d460d7bfSJunchao Zhang PetscScalar *xarray; 1407d460d7bfSJunchao Zhang thrust::device_ptr<const PetscScalar> bGPU; 1408d460d7bfSJunchao Zhang thrust::device_ptr<PetscScalar> xGPU; 1409d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1410d460d7bfSJunchao Zhang const Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1411d460d7bfSJunchao Zhang const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; 1412d460d7bfSJunchao Zhang const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1413d460d7bfSJunchao Zhang PetscInt m = A->rmap->n; 1414d460d7bfSJunchao Zhang 1415d460d7bfSJunchao Zhang PetscFunctionBegin; 1416d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1417d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1418d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1419d460d7bfSJunchao Zhang xGPU = thrust::device_pointer_cast(xarray); 1420d460d7bfSJunchao Zhang bGPU = thrust::device_pointer_cast(barray); 1421d460d7bfSJunchao Zhang 1422d460d7bfSJunchao Zhang // Reorder b with the row permutation if needed, and wrap the result in fs->X 1423d460d7bfSJunchao 
Zhang if (fs->rpermIndices) { 1424d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1425d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1426d460d7bfSJunchao Zhang } else { 1427d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1428d460d7bfSJunchao Zhang } 1429d460d7bfSJunchao Zhang 1430d460d7bfSJunchao Zhang // Solve L Y = X 1431d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1432d460d7bfSJunchao Zhang // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()! 1433d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L)); 1434d460d7bfSJunchao Zhang 1435d460d7bfSJunchao Zhang // Solve U X = Y 1436d460d7bfSJunchao Zhang if (fs->cpermIndices) { 1437d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1438d460d7bfSJunchao Zhang } else { 1439d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1440d460d7bfSJunchao Zhang } 1441d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 1442d460d7bfSJunchao Zhang 1443d460d7bfSJunchao Zhang // Reorder X with the column permutation if needed, and put the result back to x 1444d460d7bfSJunchao Zhang if (fs->cpermIndices) { 1445d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), 
fs->cpermIndices->begin()), 1446d460d7bfSJunchao Zhang thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1447d460d7bfSJunchao Zhang } 1448d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1449d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1450d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1451d460d7bfSJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m)); 1452d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1453d460d7bfSJunchao Zhang } 1454d460d7bfSJunchao Zhang 1455d460d7bfSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x) 1456d460d7bfSJunchao Zhang { 1457d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 1458d460d7bfSJunchao Zhang Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 1459d460d7bfSJunchao Zhang const PetscScalar *barray; 1460d460d7bfSJunchao Zhang PetscScalar *xarray; 1461d460d7bfSJunchao Zhang thrust::device_ptr<const PetscScalar> bGPU; 1462d460d7bfSJunchao Zhang thrust::device_ptr<PetscScalar> xGPU; 1463d460d7bfSJunchao Zhang const cusparseOperation_t opA = CUSPARSE_OPERATION_TRANSPOSE; 1464d460d7bfSJunchao Zhang const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 1465d460d7bfSJunchao Zhang PetscInt m = A->rmap->n; 1466d460d7bfSJunchao Zhang 1467d460d7bfSJunchao Zhang PetscFunctionBegin; 1468d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1469d460d7bfSJunchao Zhang if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time 1470d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 1471d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. 
We only do transpose solve with it */ 1472d460d7bfSJunchao Zhang fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 1473d460d7bfSJunchao Zhang 1474d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); 1475d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 1476d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 1477d460d7bfSJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 1478d460d7bfSJunchao Zhang fs->createdTransposeSpSVDescr = PETSC_TRUE; 1479d460d7bfSJunchao Zhang } 1480d460d7bfSJunchao Zhang 1481d460d7bfSJunchao Zhang if (!fs->updatedTransposeSpSVAnalysis) { 1482d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt)); 1483d460d7bfSJunchao Zhang 1484d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 1485d460d7bfSJunchao Zhang fs->updatedTransposeSpSVAnalysis = PETSC_TRUE; 1486d460d7bfSJunchao Zhang } 1487d460d7bfSJunchao Zhang 1488d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1489d460d7bfSJunchao Zhang PetscCall(VecCUDAGetArrayRead(b, &barray)); 1490d460d7bfSJunchao Zhang xGPU = thrust::device_pointer_cast(xarray); 1491d460d7bfSJunchao Zhang bGPU = thrust::device_pointer_cast(barray); 1492d460d7bfSJunchao Zhang 1493d460d7bfSJunchao Zhang // Reorder b with the row permutation if needed, and wrap the result in fs->X 1494d460d7bfSJunchao Zhang if (fs->rpermIndices) { 
1495d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 1496d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1497d460d7bfSJunchao Zhang } else { 1498d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1499d460d7bfSJunchao Zhang } 1500d460d7bfSJunchao Zhang 1501d460d7bfSJunchao Zhang // Solve Ut Y = X 1502d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1503d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 1504d460d7bfSJunchao Zhang 1505d460d7bfSJunchao Zhang // Solve Lt X = Y 1506d460d7bfSJunchao Zhang if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 1507d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 1508d460d7bfSJunchao Zhang } else { 1509d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 1510d460d7bfSJunchao Zhang } 1511d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt)); 1512d460d7bfSJunchao Zhang 1513d460d7bfSJunchao Zhang // Reorder X with the column permutation if needed, and put the result back to x 1514d460d7bfSJunchao Zhang if (fs->cpermIndices) { 1515d460d7bfSJunchao Zhang PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 1516d460d7bfSJunchao Zhang 
thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 1517d460d7bfSJunchao Zhang } 1518d460d7bfSJunchao Zhang 1519d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayRead(b, &barray)); 1520d460d7bfSJunchao Zhang PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 1521d460d7bfSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1522d460d7bfSJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n)); 1523d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1524d460d7bfSJunchao Zhang } 1525d460d7bfSJunchao Zhang #else 1526a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 1527d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) 1528d71ae5a4SJacob Faibussowitsch { 1529c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1530465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1531465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1532465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1533465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1534bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1535aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1536aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1537aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1538bda325fcSPaul Mullowney 1539bda325fcSPaul Mullowney PetscFunctionBegin; 1540aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1541aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 15429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1543aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1544aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1545bda325fcSPaul Mullowney } 1546bda325fcSPaul Mullowney 1547bda325fcSPaul Mullowney /* Get the GPU pointers */ 15489566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 15499566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1550c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1551c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1552bda325fcSPaul Mullowney 15539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1554aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 15559371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU); 1556aa372e3fSPaul Mullowney 1557aa372e3fSPaul Mullowney /* First, solve U */ 15589f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 15599f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1560aa372e3fSPaul Mullowney 1561aa372e3fSPaul Mullowney /* Then, solve L */ 15629f7ba44dSJacob Faibussowitsch 
PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 15639f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1564aa372e3fSPaul Mullowney 1565aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 15669371c9d4SSatish Balay thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()); 1567aa372e3fSPaul Mullowney 1568aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. 
*/ 1569a0e72f99SJunchao Zhang thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU); 1570bda325fcSPaul Mullowney 1571bda325fcSPaul Mullowney /* restore */ 15729566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayRead(bb, &barray)); 15739566063dSJacob Faibussowitsch PetscCall(VecCUDARestoreArrayWrite(xx, &xarray)); 15749566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n)); 15763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1577bda325fcSPaul Mullowney } 1578bda325fcSPaul Mullowney 1579d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) 1580d71ae5a4SJacob Faibussowitsch { 1581465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1582465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1583bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1584aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1585aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1586aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY *)cusparseTriFactors->workVector; 1587bda325fcSPaul Mullowney 1588bda325fcSPaul Mullowney PetscFunctionBegin; 1589aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1590aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 15919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A)); 1592aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose; 1593aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose; 1594bda325fcSPaul Mullowney } 1595bda325fcSPaul Mullowney 1596bda325fcSPaul Mullowney /* Get the GPU pointers */ 15979566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayWrite(xx, &xarray)); 15989566063dSJacob Faibussowitsch PetscCall(VecCUDAGetArrayRead(bb, &barray)); 1599bda325fcSPaul Mullowney 16009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1601aa372e3fSPaul Mullowney /* First, solve U */ 16029f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), 16039f7ba44dSJacob Faibussowitsch upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer)); 1604aa372e3fSPaul Mullowney 1605aa372e3fSPaul Mullowney /* Then, solve L */ 16069f7ba44dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 16079f7ba44dSJacob Faibussowitsch loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1608bda325fcSPaul Mullowney 
/* restore (tail of the preceding triangular-solve routine; its head precedes this chunk) */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Solve A x = b with the cached cuSPARSE LU triangular factors when a fill-reducing
   ordering was used: apply the row permutation to b, solve L then U, and finally
   apply the column permutation to produce x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation: tempGPU = b(rperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: tempGPU -> xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: xarray -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation: x = tempGPU(cperm) */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Solve A x = b with the cached cuSPARSE LU triangular factors in natural ordering:
   no permutations are applied, just L then U solves (b -> tempGPU -> x). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: barray -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U: tempGPU -> xarray */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Numeric ILU(0) factorization: copy A's values into the factor's CSR value array,
   run cusparseXcsrilu02() in place, then (re)do or update the SpSV analysis for the
   L and U triangular solves. The symbolic phase must have been done already. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* CUDA >= 12.1.1: cheaper than a full re-analysis when only the values changed */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
#endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
       See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
    /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ILU(0) factorization: since ILU(0) produces no fill, fact reuses A's
   sparsity pattern. Allocates the factor's CSR arrays, creates the cuSPARSE
   descriptors for M (the combined in-place factor), L, and U, sizes and allocates
   the factorization/SpSV buffers, runs the csrilu02 analysis, and estimates the
   FLOPs of a later numeric factorization. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. The returned Ai, Aj are 32-bit */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
   */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
           and include the eliminated one will be updated, which incurs a multiplication and an addition.
           NOTE(review): nzLeft is immediately overwritten below, so the Adiag-based value above is unused;
           presumably (nzRow - 1) / 2 is an intentional average-case estimate — confirm upstream.
         */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Triangular solve for ICC(0): solve L y = b then L^T x = y using the SpSV
   descriptors prepared by the symbolic/numeric ICC(0) phases. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Numeric ICC(0) factorization: copy A's values into the factor's CSR value array,
   run cusparseXcsric02() in place, then (re)do or update the SpSV analysis for the
   L and L^T triangular solves. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

#if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* CUDA >= 12.1.1: cheaper than a full re-analysis when only the values changed */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
#endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
       ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
     */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic ICC(0) factorization: ICC(0) produces no fill, so fact reuses A's
   sparsity pattern. Allocates the factor's CSR arrays and the cuSPARSE
   descriptors/buffers needed for csric02 and the L / L^T SpSV solves.
   (The definition continues past this chunk.) */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
   */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
2068da112707SJunchao Zhang */ 2069da112707SJunchao Zhang fillMode = CUSPARSE_FILL_MODE_LOWER; 2070da112707SJunchao Zhang diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 2071d460d7bfSJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 20729371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 20739371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 2074da112707SJunchao Zhang 2075da112707SJunchao Zhang /* ========================================================================= */ 2076da112707SJunchao Zhang /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers */ 2077da112707SJunchao Zhang /* ========================================================================= */ 2078da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M)); 2079d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); 2080da112707SJunchao Zhang 2081da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 2082da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 2083da112707SJunchao Zhang 2084da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 2085da112707SJunchao Zhang PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 2086da112707SJunchao Zhang 2087da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 20889371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, 
CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 2089da112707SJunchao Zhang 2090da112707SJunchao Zhang PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt)); 20919371c9d4SSatish Balay PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt)); 2092da112707SJunchao Zhang 209312ba2bc6SJunchao Zhang /* To save device memory, we make the factorization buffer share with one of the solver buffer. 209412ba2bc6SJunchao Zhang See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(). 209512ba2bc6SJunchao Zhang */ 209612ba2bc6SJunchao Zhang if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) { 209712ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 209812ba2bc6SJunchao Zhang fs->spsvBuffer_L = fs->factBuffer_M; 2099da112707SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt)); 210012ba2bc6SJunchao Zhang } else { 210112ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M))); 210212ba2bc6SJunchao Zhang fs->spsvBuffer_Lt = fs->factBuffer_M; 210312ba2bc6SJunchao Zhang PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 210412ba2bc6SJunchao Zhang } 2105da112707SJunchao Zhang 2106da112707SJunchao Zhang /* ========================================================================== */ 2107da112707SJunchao Zhang /* Perform analysis of ic0 on M */ 2108da112707SJunchao Zhang /* The lower triangular part of M has the same sparsity pattern as L */ 2109da112707SJunchao Zhang /* 
========================================================================== */ 2110da112707SJunchao Zhang int structural_zero; 2111da112707SJunchao Zhang cusparseStatus_t status; 2112da112707SJunchao Zhang 2113da112707SJunchao Zhang fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 2114d460d7bfSJunchao Zhang if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); 2115da112707SJunchao Zhang if (PetscDefined(USE_DEBUG)) { 211646aba097SBarry Smith /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */ 2117da112707SJunchao Zhang status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero); 2118da112707SJunchao Zhang PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero); 2119da112707SJunchao Zhang } 2120da112707SJunchao Zhang 2121da112707SJunchao Zhang /* Estimate FLOPs of the numeric factorization */ 21220dd8c0acSJunchao Zhang { 2123da112707SJunchao Zhang Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 21240dd8c0acSJunchao Zhang PetscInt *Ai, nzRow, nzLeft; 2125da112707SJunchao Zhang PetscLogDouble flops = 0.0; 2126da112707SJunchao Zhang 2127da112707SJunchao Zhang Ai = Aseq->i; 2128da112707SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2129da112707SJunchao Zhang nzRow = Ai[i + 1] - Ai[i]; 2130da112707SJunchao Zhang if (nzRow > 1) { 2131da112707SJunchao Zhang /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 2132da112707SJunchao Zhang and include the eliminated one will be updated, which incurs a multiplication and an addition. 
2133da112707SJunchao Zhang */ 2134da112707SJunchao Zhang nzLeft = (nzRow - 1) / 2; 2135da112707SJunchao Zhang flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 2136da112707SJunchao Zhang } 2137da112707SJunchao Zhang } 2138da112707SJunchao Zhang fs->numericFactFlops = flops; 21390dd8c0acSJunchao Zhang } 2140da112707SJunchao Zhang fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0; 21413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2142da112707SJunchao Zhang } 2143da112707SJunchao Zhang #endif 2144da112707SJunchao Zhang 2145d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 2146d460d7bfSJunchao Zhang { 2147b820271fSJunchao Zhang // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors. 2148b820271fSJunchao Zhang Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 2149d460d7bfSJunchao Zhang 2150d460d7bfSJunchao Zhang PetscFunctionBegin; 2151d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 2152d460d7bfSJunchao Zhang PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 2153d460d7bfSJunchao Zhang B->offloadmask = PETSC_OFFLOAD_CPU; 2154d460d7bfSJunchao Zhang 2155d460d7bfSJunchao Zhang if (!cusparsestruct->use_cpu_solve) { 2156b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2157d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_LU; 2158d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU; 2159d460d7bfSJunchao Zhang #else 2160d460d7bfSJunchao Zhang /* determine which version of MatSolve needs to be used. 
*/ 2161d460d7bfSJunchao Zhang Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 2162d460d7bfSJunchao Zhang IS isrow = b->row, iscol = b->col; 2163d460d7bfSJunchao Zhang PetscBool row_identity, col_identity; 2164d460d7bfSJunchao Zhang 2165d460d7bfSJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 2166d460d7bfSJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 2167d460d7bfSJunchao Zhang if (row_identity && col_identity) { 2168d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 2169d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 2170d460d7bfSJunchao Zhang } else { 2171d460d7bfSJunchao Zhang B->ops->solve = MatSolve_SeqAIJCUSPARSE; 2172d460d7bfSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 2173d460d7bfSJunchao Zhang } 2174d460d7bfSJunchao Zhang #endif 2175d460d7bfSJunchao Zhang } 2176d460d7bfSJunchao Zhang B->ops->matsolve = NULL; 2177d460d7bfSJunchao Zhang B->ops->matsolvetranspose = NULL; 2178d460d7bfSJunchao Zhang 2179d460d7bfSJunchao Zhang /* get the triangular factors */ 2180d460d7bfSJunchao Zhang if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B)); 2181d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2182d460d7bfSJunchao Zhang } 2183d460d7bfSJunchao Zhang 2184d460d7bfSJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2185d460d7bfSJunchao Zhang { 2186d460d7bfSJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr); 2187d460d7bfSJunchao Zhang 2188d460d7bfSJunchao Zhang PetscFunctionBegin; 2189d460d7bfSJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2190d460d7bfSJunchao Zhang PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2191d460d7bfSJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 
2192d460d7bfSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2193d460d7bfSJunchao Zhang } 2194d460d7bfSJunchao Zhang 2195d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2196d71ae5a4SJacob Faibussowitsch { 2197da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2198da112707SJunchao Zhang 2199da112707SJunchao Zhang PetscFunctionBegin; 2200b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2201bc996fdcSJunchao Zhang PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2202f82ac72cSJunchao Zhang if (!info->factoronhost) { 2203da112707SJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 2204da112707SJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 2205bc996fdcSJunchao Zhang } 2206da112707SJunchao Zhang if (!info->levels && row_identity && col_identity) { 2207da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info)); 2208da112707SJunchao Zhang } else 2209da112707SJunchao Zhang #endif 2210da112707SJunchao Zhang { 2211da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2212da112707SJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2213da112707SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2214da112707SJunchao Zhang } 22153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2216da112707SJunchao Zhang } 2217da112707SJunchao Zhang 2218d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2219d71ae5a4SJacob Faibussowitsch { 2220da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2221da112707SJunchao Zhang 2222da112707SJunchao Zhang PetscFunctionBegin; 2223b917901dSJunchao 
Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2224bc996fdcSJunchao Zhang PetscBool perm_identity = PETSC_FALSE; 2225f82ac72cSJunchao Zhang if (!info->factoronhost) PetscCall(ISIdentity(perm, &perm_identity)); 2226da112707SJunchao Zhang if (!info->levels && perm_identity) { 2227da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info)); 2228da112707SJunchao Zhang } else 2229da112707SJunchao Zhang #endif 2230da112707SJunchao Zhang { 2231da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2232da112707SJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2233da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 2234da112707SJunchao Zhang } 22353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2236da112707SJunchao Zhang } 2237da112707SJunchao Zhang 2238d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2239d71ae5a4SJacob Faibussowitsch { 2240da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr; 2241da112707SJunchao Zhang 2242da112707SJunchao Zhang PetscFunctionBegin; 2243da112707SJunchao Zhang PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors)); 2244da112707SJunchao Zhang PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); 2245da112707SJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 22463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2247da112707SJunchao Zhang } 2248da112707SJunchao Zhang 224966976f2fSJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type) 2250d71ae5a4SJacob Faibussowitsch { 2251841d4cb1SJunchao Zhang PetscFunctionBegin; 2252841d4cb1SJunchao Zhang *type = MATSOLVERCUSPARSE; 22533ba16761SJacob Faibussowitsch 
PetscFunctionReturn(PETSC_SUCCESS); 2254841d4cb1SJunchao Zhang } 2255841d4cb1SJunchao Zhang 2256841d4cb1SJunchao Zhang /*MC 2257841d4cb1SJunchao Zhang MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 225811a5261eSBarry Smith on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported 2259841d4cb1SJunchao Zhang algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 2260841d4cb1SJunchao Zhang performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 226111a5261eSBarry Smith CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 2262841d4cb1SJunchao Zhang algorithms are not recommended. This class does NOT support direct solver operations. 2263841d4cb1SJunchao Zhang 2264841d4cb1SJunchao Zhang Level: beginner 2265841d4cb1SJunchao Zhang 22661cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, 22672ef1f0ffSBarry Smith `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 2268841d4cb1SJunchao Zhang M*/ 2269841d4cb1SJunchao Zhang 2270d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) 2271d71ae5a4SJacob Faibussowitsch { 2272841d4cb1SJunchao Zhang PetscInt n = A->rmap->n; 2273841d4cb1SJunchao Zhang 2274841d4cb1SJunchao Zhang PetscFunctionBegin; 2275841d4cb1SJunchao Zhang PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 2276841d4cb1SJunchao Zhang PetscCall(MatSetSizes(*B, n, n, n, n)); 2277b820271fSJunchao Zhang (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors 2278841d4cb1SJunchao Zhang PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE)); 2279841d4cb1SJunchao Zhang 2280841d4cb1SJunchao Zhang if 
(A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE)); 2281841d4cb1SJunchao Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2282841d4cb1SJunchao Zhang PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2283841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2284841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 2285841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 2286841d4cb1SJunchao Zhang } else { 2287841d4cb1SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ; 2288841d4cb1SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJ; 2289841d4cb1SJunchao Zhang } 2290841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2291841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2292841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2293841d4cb1SJunchao Zhang } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2294841d4cb1SJunchao Zhang if (!A->boundtocpu) { 2295841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 2296841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 2297841d4cb1SJunchao Zhang } else { 2298841d4cb1SJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJ; 2299841d4cb1SJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ; 2300841d4cb1SJunchao Zhang } 2301841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2302841d4cb1SJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2303841d4cb1SJunchao Zhang } else 
SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types"); 2304841d4cb1SJunchao Zhang 2305841d4cb1SJunchao Zhang PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2306841d4cb1SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 2307f4f49eeaSPierre Jolivet PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse)); 23083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2309841d4cb1SJunchao Zhang } 2310841d4cb1SJunchao Zhang 2311d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 2312d71ae5a4SJacob Faibussowitsch { 23137e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 23147e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2315b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2316da112707SJunchao Zhang Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 23170dd8c0acSJunchao Zhang #endif 23187e8381f9SStefano Zampini 23197e8381f9SStefano Zampini PetscFunctionBegin; 23207e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 23219566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 2322da112707SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 2323da112707SJunchao Zhang CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat; 23249566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2325da112707SJunchao Zhang } 2326b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2327da112707SJunchao Zhang else if (fs->csrVal) { 2328da112707SJunchao Zhang /* We have a factorized matrix on device and are able to copy it to host */ 2329da112707SJunchao Zhang PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost)); 2330da112707SJunchao Zhang } 
2331da112707SJunchao Zhang #endif 23329371c9d4SSatish Balay else 23339371c9d4SSatish Balay SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host"); 23349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar))); 23359566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0)); 23367e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 23377e8381f9SStefano Zampini } 23383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 23397e8381f9SStefano Zampini } 23407e8381f9SStefano Zampini 2341d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2342d71ae5a4SJacob Faibussowitsch { 23437e8381f9SStefano Zampini PetscFunctionBegin; 23449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 234567a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 23463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 234767a45760SJunchao Zhang } 234867a45760SJunchao Zhang 2349d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2350d71ae5a4SJacob Faibussowitsch { 235167a45760SJunchao Zhang PetscFunctionBegin; 23527e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 235367a45760SJunchao Zhang *array = NULL; 23543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 235567a45760SJunchao Zhang } 235667a45760SJunchao Zhang 2357d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) 2358d71ae5a4SJacob Faibussowitsch { 235967a45760SJunchao Zhang PetscFunctionBegin; 23609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 236167a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 23623ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 236367a45760SJunchao Zhang } 
236467a45760SJunchao Zhang 23658eb1d50fSPierre Jolivet static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[]) 2366d71ae5a4SJacob Faibussowitsch { 236767a45760SJunchao Zhang PetscFunctionBegin; 236867a45760SJunchao Zhang *array = NULL; 23693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 237067a45760SJunchao Zhang } 237167a45760SJunchao Zhang 2372d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2373d71ae5a4SJacob Faibussowitsch { 237467a45760SJunchao Zhang PetscFunctionBegin; 237567a45760SJunchao Zhang *array = ((Mat_SeqAIJ *)A->data)->a; 23763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 237767a45760SJunchao Zhang } 237867a45760SJunchao Zhang 2379d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) 2380d71ae5a4SJacob Faibussowitsch { 238167a45760SJunchao Zhang PetscFunctionBegin; 238267a45760SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_CPU; 238367a45760SJunchao Zhang *array = NULL; 23843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 23857e8381f9SStefano Zampini } 23867e8381f9SStefano Zampini 2387d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 2388d71ae5a4SJacob Faibussowitsch { 23897ee59b9bSJunchao Zhang Mat_SeqAIJCUSPARSE *cusp; 23907ee59b9bSJunchao Zhang CsrMatrix *matrix; 23917ee59b9bSJunchao Zhang 23927ee59b9bSJunchao Zhang PetscFunctionBegin; 23937ee59b9bSJunchao Zhang PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 23947ee59b9bSJunchao Zhang PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix"); 23957ee59b9bSJunchao Zhang cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr); 23967ee59b9bSJunchao Zhang PetscCheck(cusp != NULL, 
PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL"); 23977ee59b9bSJunchao Zhang matrix = (CsrMatrix *)cusp->mat->mat; 23987ee59b9bSJunchao Zhang 23997ee59b9bSJunchao Zhang if (i) { 24007ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 24017ee59b9bSJunchao Zhang *i = matrix->row_offsets->data().get(); 24027ee59b9bSJunchao Zhang #else 24037ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 24047ee59b9bSJunchao Zhang #endif 24057ee59b9bSJunchao Zhang } 24067ee59b9bSJunchao Zhang if (j) { 24077ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES) 24087ee59b9bSJunchao Zhang *j = matrix->column_indices->data().get(); 24097ee59b9bSJunchao Zhang #else 24107ee59b9bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices"); 24117ee59b9bSJunchao Zhang #endif 24127ee59b9bSJunchao Zhang } 24137ee59b9bSJunchao Zhang if (a) *a = matrix->values->data().get(); 24147ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_CUDA; 24153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 24167ee59b9bSJunchao Zhang } 24177ee59b9bSJunchao Zhang 2418d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 2419d71ae5a4SJacob Faibussowitsch { 2420aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 24217c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 24229ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2423213423ffSJunchao Zhang PetscInt m = A->rmap->n, *ii, *ridx, tmp; 2424aa372e3fSPaul Mullowney cusparseStatus_t stat; 2425abb89eb1SStefano Zampini PetscBool both = PETSC_TRUE; 24269ae82921SPaul Mullowney 24279ae82921SPaul Mullowney PetscFunctionBegin; 242828b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU"); 2429c70f7ee4SJunchao Zhang if (A->offloadmask == 
PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 2430a49f1ed0SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */ 2431a49f1ed0SStefano Zampini CsrMatrix *matrix; 2432afb2bd1cSJunchao Zhang matrix = (CsrMatrix *)cusparsestruct->mat->mat; 243385ba7357SStefano Zampini 243408401ef6SPierre Jolivet PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 24359566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2436afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a + a->nz); 24379566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 2438f4f49eeaSPierre Jolivet PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar))); 24399566063dSJacob Faibussowitsch PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 24409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 244134d6c7a5SJose E. 
Roman } else { 2442abb89eb1SStefano Zampini PetscInt nnz; 24439566063dSJacob Faibussowitsch PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 24449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 24459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 24467c700b8dSJunchao Zhang delete cusparsestruct->workVector; 244781902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 2448a49f1ed0SStefano Zampini cusparsestruct->workVector = NULL; 2449a49f1ed0SStefano Zampini cusparsestruct->rowoffsets_gpu = NULL; 24509ae82921SPaul Mullowney try { 24519ae82921SPaul Mullowney if (a->compressedrow.use) { 24529ae82921SPaul Mullowney m = a->compressedrow.nrows; 24539ae82921SPaul Mullowney ii = a->compressedrow.i; 24549ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 24559ae82921SPaul Mullowney } else { 2456213423ffSJunchao Zhang m = A->rmap->n; 2457213423ffSJunchao Zhang ii = a->i; 2458e6e9a74fSStefano Zampini ridx = NULL; 24599ae82921SPaul Mullowney } 246008401ef6SPierre Jolivet PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 24619371c9d4SSatish Balay if (!a->a) { 24629371c9d4SSatish Balay nnz = ii[m]; 24639371c9d4SSatish Balay both = PETSC_FALSE; 24649371c9d4SSatish Balay } else nnz = a->nz; 246508401ef6SPierre Jolivet PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 24669ae82921SPaul Mullowney 246785ba7357SStefano Zampini /* create cusparse matrix */ 2468abb89eb1SStefano Zampini cusparsestruct->nrows = m; 2469aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 24709566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 24719566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 24729566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, 
CUSPARSE_MATRIX_TYPE_GENERAL));

      /* the 1/0 scalar constants live on the device, matching the device pointer mode set just below */
      PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
      PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
      PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
      PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

      /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        /* set the matrix */
        CsrMatrix *mat   = new CsrMatrix;
        mat->num_rows    = m;
        mat->num_cols    = A->cmap->n;
        mat->num_entries = nnz;
        PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
        mat->row_offsets->assign(ii, ii + m + 1);

        PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
        mat->column_indices->assign(a->j, a->j + nnz);

        PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
        if (a->a) mat->values->assign(a->a, a->a + nnz);

        /* assign the pointer */
        matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        if (mat->num_rows) { /* cusparse errors on empty matrices! */
          stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                   CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
          PetscCallCUSPARSE(stat);
        }
#endif
      } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        /* stage a CSR copy on the device, convert it to HYB/ELL, then discard the staging copy */
        CsrMatrix *mat   = new CsrMatrix;
        mat->num_rows    = m;
        mat->num_cols    = A->cmap->n;
        mat->num_entries = nnz;
        PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
        mat->row_offsets->assign(ii, ii + m + 1);

        PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
        mat->column_indices->assign(a->j, a->j + nnz);

        PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
        if (a->a) mat->values->assign(a->a, a->a + nnz);

        cusparseHybMat_t hybMat;
        PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
        cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
        PetscCallCUSPARSE(stat);
        /* assign the pointer */
        matstruct->mat = hybMat;

        if (mat) {
          if (mat->values) delete (THRUSTARRAY *)mat->values;
          if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
          if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
          delete (CsrMatrix *)mat;
        }
#endif
      }

      /* assign the compressed row indices */
      if (a->compressedrow.use) {
        PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
        PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
        matstruct->cprowIndices->assign(ridx, ridx + m);
        tmp = m;
      } else {
        cusparsestruct->workVector = NULL;
        matstruct->cprowIndices    = NULL;
        tmp                        = 0;
      }
      PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

      /* assign the pointer */
      cusparsestruct->mat = matstruct;
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
    cusparsestruct->nonzerostate = A->nonzerostate;
  }
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* thrust functor for zipped (x, y) tuples: y += x */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* thrust functor for zipped (x, y) tuples: y = x */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* thrust functor for zipped (x, y) tuples: x = y */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* scratch data attached to a MatProduct whose sparse operand is SeqAIJCUSPARSE */
struct MatMatCusparse {
PetscBool      cisdense; /* caller's C was host MATSEQDENSE: convert back after the GPU product */
  PetscScalar   *Bt;       /* explicit B^T buffer (pre-CUDA-11 csrmm path only) */
  Mat            X;        /* intermediate dense result used by PtAP/RARt */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t mmBufferSize;
  void  *mmBuffer;
  void  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};

/* destroy callback for the product data: releases every cuSPARSE descriptor and device buffer it owns */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()

/* numeric phase of sparse(AIJCUSPARSE) x dense(DENSECUDA) products (AB, AtB, ABt, PtAP, RARt) */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n, blda, clda;
  PetscBool           flg, biscuda;
  Mat_SeqAIJCUSPARSE *cusp;
  cusparseStatus_t    stat;
  cusparseOperation_t opA;
  const PetscScalar  *barray;
  PetscScalar        *carray;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* pick the multiply struct (A or its explicit transpose) and the output dimensions */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
  cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
  #else
  cusparseSpMatDescr_t &matADescr = mat->matDescr;
  #endif

  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
    if (matADescr) {
      PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
      matADescr = NULL;
    }
  #endif

    if (!matADescr) {
      stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }

    PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));

    /* only grow the work buffer, never shrink it */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0
    PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
  #endif

    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
  }
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* symbolic phase of sparse x dense products: sets sizes/block sizes of C and allocates the product data */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            m, n;
  PetscBool           cisdense, flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, m, n, m, n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mmdata));
mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
    PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* numeric phase of sparse x sparse (SpGEMM) products */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product *product = C->product;
  Mat          A, B;
2903fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2904fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2905fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2906fcdce8c4SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 2907fcdce8c4SStefano Zampini PetscBool flg; 2908fcdce8c4SStefano Zampini cusparseStatus_t stat; 2909fcdce8c4SStefano Zampini MatProductType ptype; 2910fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2911fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2912fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2913fcdce8c4SStefano Zampini #endif 2914b4285af6SJunchao Zhang cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */ 2915ccdfe979SStefano Zampini 2916ccdfe979SStefano Zampini PetscFunctionBegin; 2917ccdfe979SStefano Zampini MatCheckProduct(C, 1); 291828b400f6SJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 29199566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg)); 292028b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name); 2921fcdce8c4SStefano Zampini mmdata = (MatMatCusparse *)C->product->data; 2922fcdce8c4SStefano Zampini A = product->A; 2923fcdce8c4SStefano Zampini B = product->B; 2924fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2925fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2926fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 292708401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 
2928fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 292928b400f6SJacob Faibussowitsch PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]); 2930fcdce8c4SStefano Zampini Ccsr = (CsrMatrix *)Cmat->mat; 293128b400f6SJacob Faibussowitsch PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 2932fcdce8c4SStefano Zampini goto finalize; 2933fcdce8c4SStefano Zampini } 2934fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 29359566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 293628b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 29379566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg)); 293828b400f6SJacob Faibussowitsch PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name); 293928b400f6SJacob Faibussowitsch PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 294028b400f6SJacob Faibussowitsch PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2941fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2942fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 2943fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 294408401ef6SPierre Jolivet PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 294508401ef6SPierre Jolivet PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for 
MAT_CUSPARSE_CSR format"); 294608401ef6SPierre Jolivet PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 29479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 29489566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 2949fcdce8c4SStefano Zampini 2950fcdce8c4SStefano Zampini ptype = product->type; 2951b94d7dedSBarry Smith if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 2952fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 295328b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric"); 2954fa046f9fSJunchao Zhang } 2955b94d7dedSBarry Smith if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 2956fa046f9fSJunchao Zhang ptype = MATPRODUCT_AB; 295728b400f6SJacob Faibussowitsch PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric"); 2958fa046f9fSJunchao Zhang } 2959fcdce8c4SStefano Zampini switch (ptype) { 2960fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2961fcdce8c4SStefano Zampini Amat = Acusp->mat; 2962fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2963fcdce8c4SStefano Zampini break; 2964fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2965fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2966fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2967fcdce8c4SStefano Zampini break; 2968fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2969fcdce8c4SStefano Zampini Amat = Acusp->mat; 2970fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2971fcdce8c4SStefano Zampini break; 2972d71ae5a4SJacob Faibussowitsch default: 2973d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 
2974fcdce8c4SStefano Zampini } 2975fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 297628b400f6SJacob Faibussowitsch PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 297728b400f6SJacob Faibussowitsch PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 297828b400f6SJacob Faibussowitsch PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]); 2979fcdce8c4SStefano Zampini Acsr = (CsrMatrix *)Amat->mat; 2980fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */ 2981fcdce8c4SStefano Zampini Ccsr = (CsrMatrix *)Cmat->mat; 298228b400f6SJacob Faibussowitsch PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 298328b400f6SJacob Faibussowitsch PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 298428b400f6SJacob Faibussowitsch PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct"); 29859566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2986fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2987fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 29889566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2989b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 29909371c9d4SSatish Balay stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 29919371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2992b4285af6SJunchao Zhang #else 29939371c9d4SSatish Balay stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 29949371c9d4SSatish Balay PetscCallCUSPARSE(stat); 29959371c9d4SSatish Balay stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 29969371c9d4SSatish Balay PetscCallCUSPARSE(stat); 2997b4285af6SJunchao Zhang #endif 2998fcdce8c4SStefano Zampini #else 29999371c9d4SSatish Balay stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 30009371c9d4SSatish Balay Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 30019371c9d4SSatish Balay PetscCallCUSPARSE(stat); 3002fcdce8c4SStefano Zampini #endif 30039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(mmdata->flops)); 30049566063dSJacob Faibussowitsch PetscCallCUDA(WaitForCUDA()); 
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse matrix-matrix
  product C = op(A)*op(B) for two MATSEQAIJCUSPARSE matrices, computed on the GPU with cuSPARSE.

  Supported product types: MATPRODUCT_AB, MATPRODUCT_AtB (explicit transpose of A is formed),
  MATPRODUCT_ABt (explicit transpose of B is formed). If A (resp. B) is marked symmetric, AtB
  (resp. ABt) is mapped to AB and the fact is recorded in the product so the numeric phase can
  check it.

  Three cuSPARSE API generations are handled via preprocessor guards:
    - CUDA >= 11.4: cusparseSpGEMMreuse_* (preferred; buffers 4 and 5 are kept in mmdata for
      reuse by the numeric phase),
    - CUDA 11.0 - 11.3: cusparseSpGEMM_* (both work-estimation and compute buffers are kept),
    - CUDA < 11.0: legacy cusparseXcsrgemmNnz + cusparse_csr_spgemm (numeric values are computed
      here too, since the old interface has no symbolic-only entry point).

  Side effects: sizes/type of C are set, C's CSR structure is allocated on the GPU and mirrored
  to the host (with 32->64 bit index conversion when PetscInt is 64-bit, and expansion of
  compressed row offsets when A uses compressed-row storage), and C->ops->productnumeric is set
  to MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      i, j, m, n, k;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble                flops;
  PetscBool                     biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t              C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data: destroyed with the product, carries spgemm descriptors and work buffers */
  PetscCall(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: A^t*B with symmetric A (or A*B^t with symmetric B) is just A*B; record
     the shortcut so the numeric phase can verify it was built under the same assumption */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype                                          = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* pick operand mult structs, result sizes (m x n, inner dim k), and compressed-row flags */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m    = A->rmap->n;
    n    = B->cmap->n;
    k    = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  PetscCall(MatSetSizes(C, m, n, m, n));
  PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
  c     = (Mat_SeqAIJ *)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
    PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat        = Cmat;
  Ccusp->mat->mat   = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
  PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  /* device-resident scalar constants, used with CUSPARSE_POINTER_MODE_DEVICE */
  PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
  PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
    c->nz                = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix *)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a non-owning CSR view of B that shares column indices/values but uses the
       full (uncompressed) row offsets, cached in Bcusp->rowoffsets_gpu */
    CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
    Bcsr                 = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr      = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  /* precompute flops count (host-side, from the CPU CSR index arrays) */
  if (ptype == MATPRODUCT_AB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i + 1];
      for (j = st; j < en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2. * (b->i[brow + 1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    for (i = 0, flops = 0; i < A->rmap->n; i++) {
      const PetscInt anzi = a->i[i + 1] - a->i[i];
      const PetscInt bnzi = b->i[i + 1] - b->i[i];
      flops += (2. * anzi) * bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  PetscCall(PetscLogGpuTimeBegin());

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  // cuda-12.2 requires non-null csrRowOffsets
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
  PetscCallCUSPARSE(stat);
  PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  {
    /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
       We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
    */
    void *dBuffer1 = NULL;
    void *dBuffer2 = NULL;
    void *dBuffer3 = NULL;
    /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
    size_t bufferSize1 = 0;
    size_t bufferSize2 = 0;
    size_t bufferSize3 = 0;
    size_t bufferSize4 = 0;
    size_t bufferSize5 = 0;

    /* ask bufferSize1 bytes for external memory */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
    /* inspect the matrices A and B to understand the memory requirement for the next step */
    stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
    PetscCallCUSPARSE(stat);

    /* nnz pass: sizes queried with NULL buffers first, then the call is repeated with the
       allocated buffers; dBuffer4 must outlive this routine (kept in mmdata) */
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
    PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
    stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer1));
    PetscCallCUDA(cudaFree(dBuffer2));

    /* get matrix C non-zero entries C_nnz1 */
    PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
    c->nz = (PetscInt)C_nnz1;
    /* allocate matrix C */
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    Ccsr->values = new THRUSTARRAY(c->nz);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
    /* update matC with the new pointers */
    stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
    PetscCallCUSPARSE(stat);

    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
    stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
    PetscCallCUSPARSE(stat);
    PetscCallCUDA(cudaFree(dBuffer3));
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
    PetscCallCUSPARSE(stat);
    PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
  }
#else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
  PetscCallCUSPARSE(stat);
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
  PetscCallCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
  PetscCallCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
  c->nz = (PetscInt)C_nnz1;
  PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
                      mmdata->mmBufferSize / 1024));
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
  PetscCallCUSPARSE(stat);
  c->nz                = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCall(PetscLogGpuTimeEnd());
finalizesym:
  /* mirror the device CSR structure (i, j) to the host arrays of the Mat_SeqAIJ */
  c->free_a = PETSC_TRUE;
  PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
  PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
  c->free_ij = PETSC_TRUE;
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
    PetscInt      *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii = *Ccsr->row_offsets;
    jj = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0]    = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old  = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r + 1] = old;
    }
    for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
  PetscCall(PetscMalloc1(m, &c->ilen));
  PetscCall(PetscMalloc1(m, &c->imax));
  c->maxnz         = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax          = 0;
  /* per-row lengths and row statistics, derived from the freshly copied row offsets */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k + 1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax, nn);
  }
  PetscCall(MatMarkDiagonal_SeqAIJ(C));
  PetscCall(PetscMalloc1(c->nz, &c->a));
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  PetscCall(PetscLayoutSetUp(C->rmap));
  PetscCall(PetscLayoutSetUp(C->cmap));
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated     = PETSC_TRUE;
  C->assembled        = PETSC_FALSE;
  C->was_assembled    = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  /* Determine which operands live on the GPU; a matrix bound to the CPU is treated as non-CUSPARSE */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool usecpu = PETSC_FALSE;
    /* NOTE: the option name depends on how the product was requested: the legacy MatXxxMult() API
       (product->api_user) uses per-operation option names, the MatProduct API uses a single generic one */
    switch (product->type) {
    case MATPRODUCT_AB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_AtB:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
        PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_PtAP:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
        PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_RARt:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
        PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    case MATPRODUCT_ABC:
      if (product->api_user) {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
        PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else {
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
      break;
    default:
      break;
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user forced the CPU backend: fall through to the AIJ path below */
  }
  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
      break;
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      /* triple products are handled by chaining two binary products */
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
Faibussowitsch { 35129ae82921SPaul Mullowney PetscFunctionBegin; 35139566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 35143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3515e6e9a74fSStefano Zampini } 3516e6e9a74fSStefano Zampini 3517d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3518d71ae5a4SJacob Faibussowitsch { 3519e6e9a74fSStefano Zampini PetscFunctionBegin; 35209566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 35213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3522e6e9a74fSStefano Zampini } 3523e6e9a74fSStefano Zampini 3524d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3525d71ae5a4SJacob Faibussowitsch { 3526e6e9a74fSStefano Zampini PetscFunctionBegin; 35279566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 35283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3529e6e9a74fSStefano Zampini } 3530e6e9a74fSStefano Zampini 3531d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3532d71ae5a4SJacob Faibussowitsch { 3533e6e9a74fSStefano Zampini PetscFunctionBegin; 35349566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 35353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 35369ae82921SPaul Mullowney } 35379ae82921SPaul Mullowney 3538d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3539d71ae5a4SJacob Faibussowitsch { 3540ca45077fSPaul Mullowney PetscFunctionBegin; 35419566063dSJacob Faibussowitsch PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 
35423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3543ca45077fSPaul Mullowney } 3544ca45077fSPaul Mullowney 3545d71ae5a4SJacob Faibussowitsch __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3546d71ae5a4SJacob Faibussowitsch { 3547a0e72f99SJunchao Zhang int i = blockIdx.x * blockDim.x + threadIdx.x; 3548a0e72f99SJunchao Zhang if (i < n) y[idx[i]] += x[i]; 3549a0e72f99SJunchao Zhang } 3550a0e72f99SJunchao Zhang 3551afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3552d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3553d71ae5a4SJacob Faibussowitsch { 35549ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3555aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 35569ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3557e6e9a74fSStefano Zampini PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3558e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3559e6e9a74fSStefano Zampini PetscBool compressed; 3560afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3561afb2bd1cSJunchao Zhang PetscInt nx, ny; 3562afb2bd1cSJunchao Zhang #endif 35636e111a19SKarl Rupp 35649ae82921SPaul Mullowney PetscFunctionBegin; 356508401ef6SPierre Jolivet PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3566cbc6b225SStefano Zampini if (!a->nz) { 3567995bce04SJacob Faibussowitsch if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3568995bce04SJacob Faibussowitsch else PetscCall(VecSeq_CUDA::Set(zz, 0)); 35693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3570e6e9a74fSStefano Zampini } 357134d6c7a5SJose E. 
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* Select the device matrix structure to multiply with: A itself, or its explicitly stored transpose */
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try { /* thrust/cusparse wrappers may throw; converted to a PETSc error in the catch below */
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
         allocated to accommodate different uses. So we get the length info directly from mat.
      */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_cols; // since y = Ax
        ny = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx = mat->num_rows; // since y = A^T x
        ny = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
  #else
      cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
  #endif

      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
      if (!matDescr) { /* lazily create the per-opA CSR descriptor */
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      }
  #endif

      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
        PetscCallCUSPARSE(
          cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
  #endif
        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {           /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero for the multiply-add; without y the first write per nonzero row is an assignment */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{ /* zz = A^T * xx + yy */
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Assembly is done on the CPU AIJ data; the device copy is refreshed lazily on first use */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format for use on NVIDIA GPUs

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

  When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`,
          `MatSetPreallocationCOO()`, `MatSetValuesCOO()`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  /* Factored matrices store triangular-factor data in spptr instead of the usual GPU matrix struct */
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* Remove the composed methods registered at creation/convert time before falling back to the AIJ destroy */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicate as plain AIJ, then convert the copy in place to the CUSPARSE type */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
38199ff858a8SKarl Rupp } 38209ff858a8SKarl Rupp 3821d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) 3822d71ae5a4SJacob Faibussowitsch { 3823a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 3824039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cy; 3825039c6fbaSStefano Zampini Mat_SeqAIJCUSPARSE *cx; 3826039c6fbaSStefano Zampini PetscScalar *ay; 3827039c6fbaSStefano Zampini const PetscScalar *ax; 3828039c6fbaSStefano Zampini CsrMatrix *csry, *csrx; 3829e6e9a74fSStefano Zampini 383095639643SRichard Tran Mills PetscFunctionBegin; 3831a49f1ed0SStefano Zampini cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3832a49f1ed0SStefano Zampini cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3833039c6fbaSStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 38349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 38359566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 38363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 383795639643SRichard Tran Mills } 3838039c6fbaSStefano Zampini /* if we are here, it means both matrices are bound to GPU */ 38399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 38409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 38415f80ce2aSJacob Faibussowitsch PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 38425f80ce2aSJacob Faibussowitsch PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3843039c6fbaSStefano Zampini csry = (CsrMatrix *)cy->mat->mat; 3844039c6fbaSStefano Zampini csrx = (CsrMatrix *)cx->mat->mat; 3845039c6fbaSStefano Zampini /* see if we can turn this into a cublas axpy */ 3846039c6fbaSStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && 
!y->compressedrow.use) { 3847039c6fbaSStefano Zampini bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3848ad540459SPierre Jolivet if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3849039c6fbaSStefano Zampini if (eq) str = SAME_NONZERO_PATTERN; 3850039c6fbaSStefano Zampini } 3851d2be01edSStefano Zampini /* spgeam is buggy with one column */ 3852d2be01edSStefano Zampini if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3853039c6fbaSStefano Zampini 3854039c6fbaSStefano Zampini if (str == SUBSET_NONZERO_PATTERN) { 3855039c6fbaSStefano Zampini PetscScalar b = 1.0; 3856039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3857039c6fbaSStefano Zampini size_t bufferSize; 3858039c6fbaSStefano Zampini void *buffer; 3859039c6fbaSStefano Zampini #endif 3860039c6fbaSStefano Zampini 38619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 38629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 38639566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3864039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 38659371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 38669371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 38679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 38689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 38699371c9d4SSatish Balay 
PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 38709371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 38719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 38729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 38739566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(buffer)); 3874039c6fbaSStefano Zampini #else 38759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 38769371c9d4SSatish Balay PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 38779371c9d4SSatish Balay csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 38789566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 38799566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3880039c6fbaSStefano Zampini #endif 38819566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 38829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 38839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 38849566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3885039c6fbaSStefano Zampini } else if (str == SAME_NONZERO_PATTERN) { 3886a587d139SMark cublasHandle_t cublasv2handle; 3887a587d139SMark PetscBLASInt one = 1, bnz = 1; 3888039c6fbaSStefano Zampini 38899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 38909566063dSJacob Faibussowitsch 
PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 38919566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 38929566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(x->nz, &bnz)); 38939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 38949566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one)); 38959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * bnz)); 38969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 38979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 38989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 38999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 3900039c6fbaSStefano Zampini } else { 39019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 39029566063dSJacob Faibussowitsch PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3903a587d139SMark } 39043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 390595639643SRichard Tran Mills } 390695639643SRichard Tran Mills 3907d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) 3908d71ae5a4SJacob Faibussowitsch { 390933c9ba73SStefano Zampini Mat_SeqAIJ *y = (Mat_SeqAIJ *)Y->data; 391033c9ba73SStefano Zampini PetscScalar *ay; 391133c9ba73SStefano Zampini cublasHandle_t cublasv2handle; 391233c9ba73SStefano Zampini PetscBLASInt one = 1, bnz = 1; 391333c9ba73SStefano Zampini 391433c9ba73SStefano Zampini PetscFunctionBegin; 39159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 39169566063dSJacob Faibussowitsch PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 39179566063dSJacob Faibussowitsch PetscCall(PetscBLASIntCast(y->nz, &bnz)); 39189566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 39199566063dSJacob Faibussowitsch PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one)); 
39209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(bnz)); 39219566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 39229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 39239566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(Y)); 39243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 392533c9ba73SStefano Zampini } 392633c9ba73SStefano Zampini 3927d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 3928d71ae5a4SJacob Faibussowitsch { 39297e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3930a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 39317e8381f9SStefano Zampini 39323fa6b06aSMark Adams PetscFunctionBegin; 39333fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 39343fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr; 39357e8381f9SStefano Zampini if (spptr->mat) { 39367e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat; 39377e8381f9SStefano Zampini if (matrix->values) { 39387e8381f9SStefano Zampini both = PETSC_TRUE; 39397e8381f9SStefano Zampini thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 39407e8381f9SStefano Zampini } 39417e8381f9SStefano Zampini } 39427e8381f9SStefano Zampini if (spptr->matTranspose) { 39437e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat; 3944ad540459SPierre Jolivet if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); 39457e8381f9SStefano Zampini } 39463fa6b06aSMark Adams } 39479566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); 39489566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 39497e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3950a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 39513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
39523fa6b06aSMark Adams } 39533fa6b06aSMark Adams 39542c55c4ccSJose E. Roman static PetscErrorCode MatGetCurrentMemType_SeqAIJCUSPARSE(PETSC_UNUSED Mat A, PetscMemType *m) 395503db1824SAlex Lindsay { 395603db1824SAlex Lindsay PetscFunctionBegin; 395703db1824SAlex Lindsay *m = PETSC_MEMTYPE_CUDA; 395803db1824SAlex Lindsay PetscFunctionReturn(PETSC_SUCCESS); 395903db1824SAlex Lindsay } 396003db1824SAlex Lindsay 3961d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) 3962d71ae5a4SJacob Faibussowitsch { 3963a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3964a587d139SMark 3965a587d139SMark PetscFunctionBegin; 39669a14fc28SStefano Zampini if (A->factortype != MAT_FACTOR_NONE) { 39679a14fc28SStefano Zampini A->boundtocpu = flg; 39683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 39699a14fc28SStefano Zampini } 3970a587d139SMark if (flg) { 39719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 3972a587d139SMark 397333c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJ; 3974a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3975a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3976a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3977a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3978a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3979a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3980a587d139SMark A->ops->multhermitiantranspose = NULL; 3981a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3982fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJ; 398303db1824SAlex Lindsay A->ops->getcurrentmemtype = NULL; 39849566063dSJacob Faibussowitsch PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); 39859566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL)); 39869566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL)); 39879566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL)); 39889566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 39899566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 39909566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 3991a587d139SMark } else { 399233c9ba73SStefano Zampini A->ops->scale = MatScale_SeqAIJCUSPARSE; 3993a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3994a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3995a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3996a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3997a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3998a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3999a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 4000a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 4001fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 400203db1824SAlex Lindsay A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE; 400367a45760SJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 400467a45760SJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 400567a45760SJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 400667a45760SJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 400767a45760SJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 
400867a45760SJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 40097ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 40107ee59b9bSJunchao Zhang 40119566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 40129566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 40139566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 40149566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 40159566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 40169566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4017a587d139SMark } 4018a587d139SMark A->boundtocpu = flg; 40194d12350bSJunchao Zhang if (flg && a->inode.size_csr) { 4020ea500dcfSRichard Tran Mills a->inode.use = PETSC_TRUE; 4021ea500dcfSRichard Tran Mills } else { 4022ea500dcfSRichard Tran Mills a->inode.use = PETSC_FALSE; 4023ea500dcfSRichard Tran Mills } 40243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4025a587d139SMark } 4026a587d139SMark 40278eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 4028d71ae5a4SJacob Faibussowitsch { 402949735bf3SStefano Zampini Mat B; 40309ae82921SPaul Mullowney 40319ae82921SPaul Mullowney PetscFunctionBegin; 40329566063dSJacob Faibussowitsch 
PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 403349735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 40349566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 403549735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 40369566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 403749735bf3SStefano Zampini } 403849735bf3SStefano Zampini B = *newmat; 403949735bf3SStefano Zampini 40409566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 40419566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 404234136279SStefano Zampini 404349735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 40449ae82921SPaul Mullowney if (B->factortype == MAT_FACTOR_NONE) { 4045e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 40469566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 40479566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 40489566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 40491a2c6b5cSJunchao Zhang spptr->format = MAT_CUSPARSE_CSR; 4050d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4051b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4052a435da06SStefano Zampini spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4053a435da06SStefano Zampini #else 4054d8132acaSStefano Zampini spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4055a435da06SStefano Zampini #endif 4056d8132acaSStefano Zampini spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4057d8132acaSStefano Zampini spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4058d8132acaSStefano Zampini #endif 40591a2c6b5cSJunchao Zhang B->spptr = spptr; 40609ae82921SPaul Mullowney } else { 4061e6e9a74fSStefano Zampini 
Mat_SeqAIJCUSPARSETriFactors *spptr; 4062e6e9a74fSStefano Zampini 40639566063dSJacob Faibussowitsch PetscCall(PetscNew(&spptr)); 40649566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 40659566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4066e6e9a74fSStefano Zampini B->spptr = spptr; 40679ae82921SPaul Mullowney } 4068e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 406949735bf3SStefano Zampini } 4070693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 40719ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 40721a2c6b5cSJunchao Zhang B->ops->setoption = MatSetOption_SeqAIJCUSPARSE; 40739ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 407495639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 4075693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 407603db1824SAlex Lindsay B->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE; 40772205254eSKarl Rupp 40789566063dSJacob Faibussowitsch PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); 40799566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE)); 40809566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE)); 4081ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 40829566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 4083ae48a8d0SStefano Zampini #endif 40849566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE)); 40853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 40869ae82921SPaul Mullowney } 40879ae82921SPaul Mullowney 4088d71ae5a4SJacob 
/* Type constructor: build a SeqAIJ matrix and convert it in place to SEQAIJCUSPARSE */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices on NVIDIA GPUs.

   Options Database Keys:
+  -mat_type aijcusparse - Sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - Sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
                                      Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - Sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Performs the `MatSolve()` on the CPU

   Level: beginner

   Notes:
   These matrices can be in either CSR, ELL, or HYB format.

   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

   Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
   if some integer values passed in do not fit in `int`.

.seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/* Register the cuSPARSE-backed LU/Cholesky/ILU/ICC factorizations with the solver-type registry */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free all device-side data owned by the Mat_SeqAIJCUSPARSE context, then the context itself */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Free the three thrust vectors of a CsrMatrix and null the pointer; a NULL input is a no-op */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = 0;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Legacy (CUDA < 11.4) teardown of one triangular-factor structure: descriptor, solve info, CSR data, and work buffers */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Teardown of a mult structure: the stored matrix (CSR or legacy HYB), descriptors, device scalars, and cached SpMV/SpMM state */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalar constants */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));

    /* release the per-operation cached SpMV state (buffer + dense-vector descriptors) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
        if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Release all factorization state held in the triangular-factors context while keeping
  the context (and its cuSPARSE handle) alive, so it can be refilled by a new
  symbolic/numeric factorization. The CUDA < 11.4 branch uses the legacy per-factor
  structures; the >= 11.4 branch uses the SpSV-based flat arrays and descriptors.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Full teardown: reset all factor data, destroy the cuSPARSE handle, free the context */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/* Strict-weak ordering on (row, col) index pairs: lexicographic, row-major.
   Used to sort COO entries into CSR order with thrust::sort/merge. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Mark the cached explicit transpose of A as out of date; if destroy is true also free it
   (including the csr2csc permutation used to rebuild it). No-op for matrices without GPU data. */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); /* no GPU data attached yet; nothing to invalidate */
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* PetscContainer destructor for the device-side MatCOOStruct_SeqAIJ: frees the
   device arrays (perm, jmap) allocated in MatSetPreallocationCOO_SeqAIJCUSPARSE,
   then the host struct itself. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Preallocate the matrix from COO (i,j) index lists. The index arrays may live in host
   or device memory; device input is staged to the host since the CPU preallocation
   routine (MatSetPreallocationCOO_SeqAIJ) does the sorting/compression. The resulting
   jmap/perm maps are then mirrored to the GPU and attached to the matrix in a container
   so MatSetValuesCOO can run entirely on the device. */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) { /* user passed device pointers: copy indices to host for CPU preallocation */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Accumulate user COO values kv[] into the CSR value array a[].
   One (grid-stride) iteration per CSR nonzero i: sums kv[perm[k]] for k in
   [jmap[i], jmap[i+1]) — i.e. all duplicate COO entries mapping to nonzero i.
   imode == INSERT_VALUES overwrites a[i]; otherwise the sum is added.
   Launch config is arbitrary thanks to the grid-stride loop. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/* Assemble matrix values from a COO value array v[] (host or device memory),
   using the device jmap/perm maps attached by MatSetPreallocationCOO_SeqAIJCUSPARSE. */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write-only: no device copy triggered */
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* divide BEFORE casting: (int)(Annz + 255) would overflow int for Annz >~ 2^31;
       the grid-stride loop in the kernel makes any positive grid size correct */
    MatAddCOOValues<<<(int)((Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without synchronizing */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`

  Level: developer

  Note:
  When compressed is true, the CSR structure does not contain empty rows

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* NOTE(review): this early return requires BOTH pointers; the later per-pointer
     `if (i)` / `if (j)` guards are therefore redundant but kept for safety */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* lazily build and cache full (uncompressed) row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed; /* unused; kept in the signature for symmetry with MatSeqAIJCUSPARSEGetIJ() */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read access: ensure device data is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* read-only access: no state change, just invalidate the caller's pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write: device data must be current before caller mutates it */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* GPU copy becomes the authoritative one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose no longer matches the (soon-modified) values */
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* values may have changed: cached diagonal is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger any host to device copies.

  It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* no MatSeqAIJCUSPARSECopyToGPU() here: write-only access deliberately skips the host-to-device copy */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* caller wrote new values: cached diagonal is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Strict-weak ordering on (row, col, value, flag) 4-tuples, comparing only the
   (row, col) pair lexicographically; used by thrust::merge when concatenating
   two CSR matrices in MatSeqAIJCUSPARSEMergeMats(). */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
Zampini 46879371c9d4SSatish Balay struct Shift { 4688ed502f03SStefano Zampini int _shift; 4689ed502f03SStefano Zampini 4690ed502f03SStefano Zampini Shift(int shift) : _shift(shift) { } 46919371c9d4SSatish Balay __host__ __device__ inline int operator()(const int &c) { return c + _shift; } 4692ed502f03SStefano Zampini }; 4693ed502f03SStefano Zampini 469421afe8ebSBarry Smith /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */ 4695d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4696d71ae5a4SJacob Faibussowitsch { 4697ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4698ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4699ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4700ed502f03SStefano Zampini CsrMatrix *Acsr, *Bcsr, *Ccsr; 4701ed502f03SStefano Zampini PetscInt Annz, Bnnz; 4702ed502f03SStefano Zampini cusparseStatus_t stat; 4703ed502f03SStefano Zampini PetscInt i, m, n, zero = 0; 4704ed502f03SStefano Zampini 4705ed502f03SStefano Zampini PetscFunctionBegin; 4706ed502f03SStefano Zampini PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4707ed502f03SStefano Zampini PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 47084f572ea9SToby Isaac PetscAssertPointer(C, 4); 4709ed502f03SStefano Zampini PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4710ed502f03SStefano Zampini PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 47115f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 471208401ef6SPierre Jolivet PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4713aed4548fSBarry Smith PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != 
MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4714aed4548fSBarry Smith PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4715ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 4716ed502f03SStefano Zampini m = A->rmap->n; 4717ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 47189566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF, C)); 47199566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*C, m, n, m, n)); 47209566063dSJacob Faibussowitsch PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4721ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4722ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4723ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4724ed502f03SStefano Zampini Ccsr = new CsrMatrix; 4725ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 4726ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 4727ed502f03SStefano Zampini c->compressedrow.nrows = 0; 4728ed502f03SStefano Zampini c->compressedrow.i = NULL; 4729ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 4730ed502f03SStefano Zampini Ccusp->workVector = NULL; 4731ed502f03SStefano Zampini Ccusp->nrows = m; 4732ed502f03SStefano Zampini Ccusp->mat = Cmat; 4733ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 4734ed502f03SStefano Zampini Ccsr->num_rows = m; 4735ed502f03SStefano Zampini Ccsr->num_cols = n; 47369566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 47379566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 47389566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4739f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 4740f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, 
sizeof(PetscScalar))); 4741f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 47429566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47439566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47449566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 47459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 47469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 474728b400f6SJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 474828b400f6SJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4749ed502f03SStefano Zampini 4750ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4751ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4752ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 4753ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 4754ed502f03SStefano Zampini c->nz = Annz + Bnnz; 4755ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4756ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4757ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 4758ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 47592c4ab24aSJunchao Zhang Ccusp->coords = new THRUSTINTARRAY(c->nz); 4760ed502f03SStefano Zampini if (c->nz) { 47612ed87e7eSStefano Zampini auto Acoo = new THRUSTINTARRAY32(Annz); 47622ed87e7eSStefano Zampini auto Bcoo = new THRUSTINTARRAY32(Bnnz); 47632ed87e7eSStefano Zampini auto Ccoo = new THRUSTINTARRAY32(c->nz); 47642ed87e7eSStefano Zampini THRUSTINTARRAY32 *Aroff, *Broff; 
47652ed87e7eSStefano Zampini 4766ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 4767ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 4768ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4769ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1); 47709566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4771ed502f03SStefano Zampini } 47722ed87e7eSStefano Zampini Aroff = Acusp->rowoffsets_gpu; 47732ed87e7eSStefano Zampini } else Aroff = Acsr->row_offsets; 4774ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 4775ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 4776ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4777ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 47789566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4779ed502f03SStefano Zampini } 47802ed87e7eSStefano Zampini Broff = Bcusp->rowoffsets_gpu; 47812ed87e7eSStefano Zampini } else Broff = Bcsr->row_offsets; 47829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 47839371c9d4SSatish Balay stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 47849371c9d4SSatish Balay PetscCallCUSPARSE(stat); 47859371c9d4SSatish Balay stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 47869371c9d4SSatish Balay PetscCallCUSPARSE(stat); 47872ed87e7eSStefano Zampini /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 47882ed87e7eSStefano Zampini auto Aperm = thrust::make_constant_iterator(1); 47892ed87e7eSStefano Zampini auto Bperm = thrust::make_constant_iterator(0); 47908909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4791ed502f03SStefano Zampini auto Bcib = 
thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4792ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 47938909a122SStefano Zampini #else 47948909a122SStefano Zampini /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 47958909a122SStefano Zampini auto Bcib = Bcsr->column_indices->begin(); 47968909a122SStefano Zampini auto Bcie = Bcsr->column_indices->end(); 47978909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 47988909a122SStefano Zampini #endif 47992ed87e7eSStefano Zampini auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 48002ed87e7eSStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 48012ed87e7eSStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 48022ed87e7eSStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 48032ed87e7eSStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 48042ed87e7eSStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 48052c4ab24aSJunchao Zhang auto p1 = Ccusp->coords->begin(); 48062c4ab24aSJunchao Zhang auto p2 = Ccusp->coords->begin(); 4807ed502f03SStefano Zampini thrust::advance(p2, Annz); 4808792fecdfSBarry Smith PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 48098909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 48108909a122SStefano Zampini thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 48118909a122SStefano Zampini #endif 48122ed87e7eSStefano Zampini auto cci = 
thrust::make_counting_iterator(zero); 48132ed87e7eSStefano Zampini auto cce = thrust::make_counting_iterator(c->nz); 48142ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0 4815792fecdfSBarry Smith PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 48162ed87e7eSStefano Zampini #else 481759c3d2bbSPierre Jolivet #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST) 48182ed87e7eSStefano Zampini auto pred = thrust::identity<int>(); 481959c3d2bbSPierre Jolivet #else 482059c3d2bbSPierre Jolivet auto pred = cuda::std::identity(); 482159c3d2bbSPierre Jolivet #endif 4822792fecdfSBarry Smith PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4823792fecdfSBarry Smith PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 48242ed87e7eSStefano Zampini #endif 48259371c9d4SSatish Balay stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 48269371c9d4SSatish Balay PetscCallCUSPARSE(stat); 48279566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 48282ed87e7eSStefano Zampini delete wPerm; 48292ed87e7eSStefano Zampini delete Acoo; 48302ed87e7eSStefano Zampini delete Bcoo; 48312ed87e7eSStefano Zampini delete Ccoo; 4832ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 48339371c9d4SSatish Balay stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 48349371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4835ed502f03SStefano Zampini #endif 48361a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 48379566063dSJacob 
Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 48389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4839ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4840ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4841ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 4842ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4843ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4844ed502f03SStefano Zampini 48451a2c6b5cSJunchao Zhang (*C)->form_explicit_transpose = PETSC_TRUE; 48461a2c6b5cSJunchao Zhang (*C)->transupdated = PETSC_TRUE; 4847a49f1ed0SStefano Zampini Ccusp->rowoffsets_gpu = NULL; 4848ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 4849ed502f03SStefano Zampini CmatT->mat = CcsrT; 4850ed502f03SStefano Zampini CcsrT->num_rows = n; 4851ed502f03SStefano Zampini CcsrT->num_cols = m; 4852ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 4853ed502f03SStefano Zampini 4854ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4855ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4856ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 4857ed502f03SStefano Zampini 48589566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4859ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 4860ed502f03SStefano Zampini if (AT) { 4861ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4862ed502f03SStefano Zampini thrust::advance(rT, -1); 4863ed502f03SStefano Zampini } 4864ed502f03SStefano Zampini if (BT) { 4865ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4866ed502f03SStefano Zampini auto tite = 
thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4867ed502f03SStefano Zampini thrust::copy(titb, tite, rT); 4868ed502f03SStefano Zampini } 4869ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 4870ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4871ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4872ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4873ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4874ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 48759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4876ed502f03SStefano Zampini 48779566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 48789566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 48799566063dSJacob Faibussowitsch PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4880f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar))); 4881f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar))); 4882f4f49eeaSPierre Jolivet PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar))); 48839566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 48849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 48859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4886ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 
48879371c9d4SSatish Balay stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 48889371c9d4SSatish Balay PetscCallCUSPARSE(stat); 4889ed502f03SStefano Zampini #endif 4890ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 4891ed502f03SStefano Zampini } 4892ed502f03SStefano Zampini } 4893ed502f03SStefano Zampini 4894ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 48959f0612e4SBarry Smith PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 48969f0612e4SBarry Smith PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 4897ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 48987de69702SBarry Smith if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 4899ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4900ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4901ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 4902ed502f03SStefano Zampini jj = *Ccsr->column_indices; 49039566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 49049566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4905ed502f03SStefano Zampini } else { 49069566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 49079566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4908ed502f03SStefano Zampini } 
49099566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 49109566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->ilen)); 49119566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &c->imax)); 4912ed502f03SStefano Zampini c->maxnz = c->nz; 4913ed502f03SStefano Zampini c->nonzerorowcnt = 0; 4914ed502f03SStefano Zampini c->rmax = 0; 4915ed502f03SStefano Zampini for (i = 0; i < m; i++) { 4916ed502f03SStefano Zampini const PetscInt nn = c->i[i + 1] - c->i[i]; 4917ed502f03SStefano Zampini c->ilen[i] = c->imax[i] = nn; 4918ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 4919ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax, nn); 4920ed502f03SStefano Zampini } 49219566063dSJacob Faibussowitsch PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 49229566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(c->nz, &c->a)); 4923ed502f03SStefano Zampini (*C)->nonzerostate++; 49249566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->rmap)); 49259566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp((*C)->cmap)); 4926ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 4927ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 4928ed502f03SStefano Zampini } else { 492908401ef6SPierre Jolivet PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4930ed502f03SStefano Zampini c = (Mat_SeqAIJ *)(*C)->data; 4931ed502f03SStefano Zampini if (c->nz) { 4932ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 49332c4ab24aSJunchao Zhang PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords"); 4934aed4548fSBarry Smith PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 493508401ef6SPierre Jolivet PetscCheck(Ccusp->nonzerostate == 
(*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 49369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 49379566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 49385f80ce2aSJacob Faibussowitsch PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 49395f80ce2aSJacob Faibussowitsch PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4940ed502f03SStefano Zampini Acsr = (CsrMatrix *)Acusp->mat->mat; 4941ed502f03SStefano Zampini Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4942ed502f03SStefano Zampini Ccsr = (CsrMatrix *)Ccusp->mat->mat; 4943aed4548fSBarry Smith PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4944aed4548fSBarry Smith PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4945aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4946aed4548fSBarry Smith PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 49472c4ab24aSJunchao Zhang PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size()); 49482c4ab24aSJunchao Zhang auto pmid = Ccusp->coords->begin(); 4949ed502f03SStefano Zampini thrust::advance(pmid, Acsr->num_entries); 49509566063dSJacob Faibussowitsch 
PetscCall(PetscLogGpuTimeBegin()); 49512c4ab24aSJunchao Zhang auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin()))); 49529371c9d4SSatish Balay auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4953ed502f03SStefano Zampini thrust::for_each(zibait, zieait, VecCUDAEquals()); 49549371c9d4SSatish Balay auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 49552c4ab24aSJunchao Zhang auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end()))); 4956ed502f03SStefano Zampini thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 49579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 49581a2c6b5cSJunchao Zhang if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 49595f80ce2aSJacob Faibussowitsch PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4960ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4961ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4962ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Gather selected entries of the matrix value array into v[].

  Input Parameters:
+ A   - the SeqAIJCUSPARSE matrix whose (device) value array is read
. n   - number of entries to copy
- idx - indices into the value array to gather (may be NULL; then the first n values are copied verbatim)

  Output Parameter:
. v - destination array; may be host or device memory (detected via isCudaMem()), the copy
      direction is chosen accordingly

  Notes:
  When idx is provided, the gather runs on the GPU with a thrust permutation iterator; if v is
  host memory, a temporary device buffer w stages the result before a single device->host copy.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v); /* true if v lives in device memory */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the index set so the gather can run entirely on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); /* host -> device transfer of idx[] */

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v); /* gather straight into v on the device */
    } else {
      w  = new THRUSTARRAY(n); /* device staging buffer; copied back to host v below */
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* zip (av[idx[i]], dv[i]) pairs and assign element-wise: dv[i] = av[idx[i]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: copy the first n values verbatim, direction depends on where v lives */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* when v is host memory the result moved device -> host, so log GpuToCpu (was mislogged as CpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}