xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 204a0e31e6a72ac0b2cf53cb7803417cbf6c6b4e) !
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
69ae82921SPaul Mullowney 
73d13b8fdSMatthew G. Knepley #include <petscconf.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
129ae82921SPaul Mullowney #undef VecType
133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
16d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
17d0967f54SJacob Faibussowitsch   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18a0e72f99SJunchao Zhang   #include <thrust/async/for_each.h>
19d0967f54SJacob Faibussowitsch #endif
20a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
21a2cee5feSJed Brown #include <thrust/remove.h>
22a2cee5feSJed Brown #include <thrust/sort.h>
23a2cee5feSJed Brown #include <thrust/unique.h>
24e8d2b73aSMark Adams 
25b0c00012SPierre Jolivet PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
26e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
27afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
29afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
30afb2bd1cSJunchao Zhang 
31afb2bd1cSJunchao Zhang   typedef enum {
32afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
35afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
36afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
37afb2bd1cSJunchao Zhang 
38afb2bd1cSJunchao Zhang   typedef enum {
39afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
40afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
41afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
42afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
43afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
47afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
48afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
49afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
50afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
51afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
52afb2bd1cSJunchao Zhang 
53afb2bd1cSJunchao Zhang   typedef enum {
5435cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
5535cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
56afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
57afb2bd1cSJunchao Zhang   */
58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
59afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
60afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
61afb2bd1cSJunchao Zhang #endif
629ae82921SPaul Mullowney 
63087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
666fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
67b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
70d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
716fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
73d460d7bfSJunchao Zhang #endif
74ce78bad3SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject);
75a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7633c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
776fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
786fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
796fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
806fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
849ae82921SPaul Mullowney 
857f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
882c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
897f756511SDominic Meiser 
9057181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9257181aedSStefano Zampini 
93c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
94e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
95219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
96c215019aSStefano Zampini 
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* A sequential matrix has a single storage format, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL act identically here */
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusp->format = format;
  } else {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1149ae82921SPaul Mullowney 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch through the composed method so only matrix types that registered it react */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
138e057df02SPaul Mullowney 
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* flag consulted by the solve path to decide whether the triangular solves stay on the host */
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
147365b711fSMark Adams 
/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch through the composed method so only matrix types that registered it react */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
171365b711fSMark Adams 
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* destroy any cached transpose now so a stale one cannot be reused if flg is flipped to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    /* every other option is handled by the host AIJ implementation */
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
187e6e9a74fSStefano Zampini 
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE      *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&fmt, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, fmt));

    /* storage format for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&fmt, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, fmt));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusp->use_cpu_solve, &cusp->use_cpu_solve, &set));
    if (set) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusp->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusp->spmvAlg, (PetscEnum *)&cusp->spmvAlg, &set));
    /* If the user used this option, verify our enum table still lines up with cuSPARSE, since PetscOptionsEnum() assigns values by position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!set || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!set || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusp->spmmAlg, (PetscEnum *)&cusp->spmmAlg, &set));
    PetscCheck(!set || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusp->csr2cscAlg, (PetscEnum *)&cusp->csr2cscAlg, &set));
    PetscCheck(!set || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
2239ae82921SPaul Mullowney 
224b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *aij = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = aij->i, *Aj = aij->j, *Adiag = aij->diag;
  const MatScalar              *Aa = aij->a;
  PetscInt                     *Mrow, *Mcol, Mnz;
  PetscScalar                  *Mval;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // first-time setup? Test csrRowPtr since it stays non-null even when m=0
      // Merge the (skewed) factored L and U into M, a regular CSR matrix assembled on the host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mrow));
      PetscCall(PetscMalloc1(Mnz, &Mcol)); // Mcol is temporary; freed once copied to the device
      PetscCall(PetscMalloc1(Mnz, &Mval));
      Mrow[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt lnz = Ai[i + 1] - Ai[i];       // nonzeros of L in row i (strictly lower part)
        PetscInt unz = Adiag[i] - Adiag[i + 1]; // nonzeros of U in row i (diagonal included)
        PetscCall(PetscArraycpy(Mcol + Mrow[i], Aj + Ai[i], lnz));                          // entries of L
        Mcol[Mrow[i] + lnz] = i;                                                            // diagonal entry
        PetscCall(PetscArraycpy(Mcol + Mrow[i] + lnz + 1, Aj + Adiag[i + 1] + 1, unz - 1)); // entries of U on the right of the diagonal
        Mrow[i + 1] = Mrow[i] + lnz + unz;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mrow, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mcol, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors used by SpSV
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Keep the host row pointers and value buffer for reuse in later numeric refreshes
      fs->csrRowPtr_h = Mrow;
      fs->csrVal_h    = Mval;
      PetscCall(PetscFree(Mcol));
    }
    // Refresh M's values from the latest host factors
    Mrow = fs->csrRowPtr_h;
    Mval = fs->csrVal_h;
    Mnz  = Mrow[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt lnz = Ai[i + 1] - Ai[i];
      PetscInt unz = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Mval + Mrow[i], Aa + Ai[i], lnz));                          // entries of L
      Mval[Mrow[i] + lnz] = (MatScalar)1.0 / Aa[Adiag[i]];                                // recover the diagonal entry (the factor stores its reciprocal)
      PetscCall(PetscArraycpy(Mval + Mrow[i] + lnz + 1, Aa + Adiag[i + 1] + 1, unz - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Mval, sizeof(*Mval) * Mnz, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed?
      // The NULL guards matter: cusparse errors out with "parameter number 3 (newValues) had an illegal value: NULL pointer"
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // cusparseSpSV_analysis() is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      fs->updatedSpSVAnalysis          = PETSC_TRUE;
      fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
329d460d7bfSJunchao Zhang #else
330d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
331d71ae5a4SJacob Faibussowitsch {
3329ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3339ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3349ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
335aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
3369ae82921SPaul Mullowney   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
3379ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3389ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3399ae82921SPaul Mullowney   PetscInt                           i, nz, nzLower, offset, rowOffset;
3409ae82921SPaul Mullowney 
3419ae82921SPaul Mullowney   PetscFunctionBegin;
3423ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
343c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3449ae82921SPaul Mullowney     try {
3459ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3469ae82921SPaul Mullowney       nzLower = n + ai[n] - ai[1];
347da79fbbcSStefano Zampini       if (!loTriFactor) {
3482cbc15d9SMark         PetscScalar *AALo;
3492cbc15d9SMark 
3509566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
3519ae82921SPaul Mullowney 
3529ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3539566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
3549566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
3559ae82921SPaul Mullowney 
3569ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3579ae82921SPaul Mullowney         AiLo[0]   = (PetscInt)0;
3589ae82921SPaul Mullowney         AiLo[n]   = nzLower;
3599ae82921SPaul Mullowney         AjLo[0]   = (PetscInt)0;
3609ae82921SPaul Mullowney         AALo[0]   = (MatScalar)1.0;
3619ae82921SPaul Mullowney         v         = aa;
3629ae82921SPaul Mullowney         vi        = aj;
3639ae82921SPaul Mullowney         offset    = 1;
3649ae82921SPaul Mullowney         rowOffset = 1;
3659ae82921SPaul Mullowney         for (i = 1; i < n; i++) {
3669ae82921SPaul Mullowney           nz = ai[i + 1] - ai[i];
367e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3689ae82921SPaul Mullowney           AiLo[i] = rowOffset;
3699ae82921SPaul Mullowney           rowOffset += nz + 1;
3709ae82921SPaul Mullowney 
371f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
372f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
3739ae82921SPaul Mullowney 
3749ae82921SPaul Mullowney           offset += nz;
3759ae82921SPaul Mullowney           AjLo[offset] = (PetscInt)i;
3769ae82921SPaul Mullowney           AALo[offset] = (MatScalar)1.0;
3779ae82921SPaul Mullowney           offset += 1;
3789ae82921SPaul Mullowney 
3799ae82921SPaul Mullowney           v += nz;
3809ae82921SPaul Mullowney           vi += nz;
3819ae82921SPaul Mullowney         }
3822205254eSKarl Rupp 
383aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
3849566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
385da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
386aa372e3fSPaul Mullowney         /* Create the matrix description */
3879566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
3889566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
3891b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3909566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
391afb2bd1cSJunchao Zhang   #else
3929566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
393afb2bd1cSJunchao Zhang   #endif
3949566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
3959566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
396aa372e3fSPaul Mullowney 
397aa372e3fSPaul Mullowney         /* set the operation */
398aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
399aa372e3fSPaul Mullowney 
400aa372e3fSPaul Mullowney         /* set the matrix */
401aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
402aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = n;
403aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = n;
404aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
405aa372e3fSPaul Mullowney 
406aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
407aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
408aa372e3fSPaul Mullowney 
409aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
410aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
411aa372e3fSPaul Mullowney 
412aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
413aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
414aa372e3fSPaul Mullowney 
415afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4169566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
417261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
4181b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4199371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4209371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
4219566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
422afb2bd1cSJunchao Zhang   #endif
423afb2bd1cSJunchao Zhang 
424aa372e3fSPaul Mullowney         /* perform the solve analysis */
4259371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4269f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
4279566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4289566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
429aa372e3fSPaul Mullowney 
430da79fbbcSStefano Zampini         /* assign the pointer */
431aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
4322cbc15d9SMark         loTriFactor->AA_h                                          = AALo;
4339566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4349566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4359566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
436da79fbbcSStefano Zampini       } else { /* update values only */
43748a46eb9SPierre Jolivet         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
438da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4392cbc15d9SMark         loTriFactor->AA_h[0] = 1.0;
440da79fbbcSStefano Zampini         v                    = aa;
441da79fbbcSStefano Zampini         vi                   = aj;
442da79fbbcSStefano Zampini         offset               = 1;
443da79fbbcSStefano Zampini         for (i = 1; i < n; i++) {
444da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i];
445f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
446da79fbbcSStefano Zampini           offset += nz;
4472cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
448da79fbbcSStefano Zampini           offset += 1;
449da79fbbcSStefano Zampini           v += nz;
450da79fbbcSStefano Zampini         }
4512cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
4529566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
453da79fbbcSStefano Zampini       }
454d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
455d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
456d71ae5a4SJacob Faibussowitsch     }
4579ae82921SPaul Mullowney   }
4583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4599ae82921SPaul Mullowney }
4609ae82921SPaul Mullowney 
/*
  Builds (or updates) the upper-triangular factor U of an ILU/LU factorization on the GPU
  using the legacy cusparse csrsv interface (this is the pre-CUDA-11.4 code path; see
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() for the dispatch).

  The host factor stores U rows via a->diag: nzUpper = adiag[0] - adiag[n], and row i's
  off-diagonal entries live at aa/aj + adiag[i+1] + 1 with the (inverted) diagonal at the
  end of that run -- NOTE(review): this skewed layout is assumed from the indexing below;
  confirm against MatILUFactorSymbolic_SeqAIJ(). The code repacks it into a plain CSR
  matrix with the diagonal first in each row, storing 1/diag explicitly since the matrix
  descriptor uses CUSPARSE_DIAG_TYPE_NON_UNIT.

  First call (no upTriFactor yet): allocates pinned host staging arrays, builds the CSR
  structure, copies it to the device, and runs the csrsv solve analysis.
  Subsequent calls: only the numerical values are refilled and re-uploaded; the symbolic
  analysis from the first build is reused (the sparsity pattern is unchanged).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first time: build structure + values */
        PetscScalar *AAUp;

        /* pinned host memory so assign()/uploads below are fast */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows bottom-up since the
           adiag[] offsets decrease with increasing row index */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements; the host factor stores the inverted
             diagonal at v[nz], so 1./v[nz] recovers the true diagonal value */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        /* diagonal is stored explicitly (inverted), hence NON_UNIT */
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 requires an explicit user-provided work buffer; query its size first */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AAUp is kept (AA_h) for cheap value-only updates, the
           index staging arrays are no longer needed */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* update values only; sparsity pattern (and csrsv analysis) is reused */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* NOTE(review): this only catches throws of char*; thrust typically throws
         std::exception-derived types, which would propagate past here -- confirm
         against the error-handling convention used elsewhere in this file */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
591d460d7bfSJunchao Zhang #endif
5929ae82921SPaul Mullowney 
/*
  Pushes the host-side ILU/LU factors of A to the GPU and performs whatever solve
  analysis the active cusparse code path requires. With CUDA >= 11.4 the generic
  SpSV path is used; otherwise the legacy lower/upper triangular builders run and
  a work vector for the two-stage triangular solve is allocated. The row and
  inverse-column permutations are cached on the device (once) unless they are the
  identity, so the solve functions can skip the permutation entirely.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowIS = aij->row, icolIS = aij->icol;
  PetscBool                     rowIsIdentity, colIsIdentity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m); // scratch between the L and U solves
#endif
  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is now sync'ed to the GPU

  /* device copy of the row permutation, skipped when it is the identity */
  PetscCall(ISIdentity(rowIS, &rowIsIdentity));
  if (!rowIsIdentity && !fs->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(rowIS, &ridx));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(ridx, ridx + m);
    PetscCall(ISRestoreIndices(rowIS, &ridx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  /* likewise for the inverse column permutation */
  PetscCall(ISIdentity(icolIS, &colIsIdentity));
  if (!colIsIdentity && !fs->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(icolIS, &cidx));
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(cidx, cidx + m);
    PetscCall(ISRestoreIndices(icolIS, &cidx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
6399ae82921SPaul Mullowney 
640b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Builds (first call) or updates (later calls) the device-side U factor of a Cholesky
  factorization (A = Ut D U) for use with the cusparse generic SpSV API.

  NOTE(review): "Cheolesky" in the name is a typo of "Cholesky"; the name is kept as-is
  since callers elsewhere in this file reference it.

  The host factored matrix uses the skewed layout produced by MatICCFactorSymbolic_SeqAIJ():
  row i of U holds only the entries to the right of the diagonal (the diagonal itself is
  stored inverted at Aa[Adiag[i]]). This routine repacks it into a regular unit-diagonal
  CSR matrix M on the device, with the inverted diagonal kept separately in fs->diag,
  then runs (or, on CUDA >= 12.1.1, incrementally updates) the SpSV analysis for both
  the U and Ut solves.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // First time doing the setup? Use csrRowPtr as the flag since it is non-null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry, put first in the row
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; values (fs->csrVal) are filled below, outside this one-time block
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse: keep the host value/diag staging arrays; Mj is no longer needed
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values (runs on every call, first-time or update)
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // negate: host stores -U off-diagonals (ICC convention)
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
    // Once a full analysis has been done, later value-only changes can use the cheaper
    // cusparseSpSV_updateMatrix() instead of redoing the analysis
    if (fs->updatedSpSVAnalysis) {
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    } else
  #endif
    {
      // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
      PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
      fs->updatedSpSVAnalysis = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
734d460d7bfSJunchao Zhang 
// Solve Ut D U x = b using the SpSV descriptors prepared by
// MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(). Three stages:
//   (1) Ut y = Pb   (transpose solve; Pb is b row-permuted if rpermIndices is set)
//   (2) y  <- D*y   (element-wise multiply; fs->diag already holds 1/diag, so this IS the D-solve)
//   (3) U  z = y    (non-transpose solve, result written to x, column-permuted back if needed)
// The dnVec descriptors X/Y are re-pointed (cusparseDnVecSetValues) at either the
// internal buffers fs->X/fs->Y or directly at the caller's arrays to avoid copies
// when no permutation is required.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: read b in place (const cast is safe; SpSV only reads X here)
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: write the solve result straight into the output array
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); // two triangular solves (~2nz each) plus the diagonal scale, minus the n unit-diagonal entries
  PetscFunctionReturn(PETSC_SUCCESS);
}
790d460d7bfSJunchao Zhang #else
/*
  Build (or refresh) the GPU triangular factors for an ICC(0)/Cholesky
  factorization on the legacy csrsv path (CUDA < 11.4).

  The factored matrix A holds its values on the host in SBAIJ-style arrays
  (b->i / b->j / b->a); each row appears to store its off-diagonal
  upper-triangular entries first with the diagonal entry last (the code indexes
  the diagonal as v[nz]) -- NOTE(review): layout inferred from the indexing
  below; confirm against the SeqAIJ ICC factorization routines.

  Two device CSR matrices are produced:
    - upTriFactor: unit-diagonal upper triangle, solved with
      CUSPARSE_OPERATION_NON_TRANSPOSE;
    - loTriFactor: the same sparsity pattern with each row additionally scaled
      by the diagonal, solved with CUSPARSE_OPERATION_TRANSPOSE so that it
      plays the role of the lower factor.

  On the first call the full structure (row offsets, column indices, values) is
  created and the csrsv solve analysis is performed for both factors; on later
  calls (structure already on the GPU) only the numerical values are rebuilt
  and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS); /* empty matrix: nothing to build */
  /* only (re)build when the freshest values live on the host */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) { /* first call: build structure and values */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers into row i of the host factor */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (v[nz] is the stored diagonal of row i) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            /* negate the off-diagonals; the "lower" copy is additionally scaled by the diagonal */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* upper fill, unit diagonal: the true diagonal was folded into the scaling above */
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 needs an explicit work buffer; query its size and allocate it */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* still stored as the UPPER triangle; the TRANSPOSE solveOp below makes it act as the lower factor */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        /* both factors share the same structure arrays, hence the factor of 2 */
        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Structure already exists on the GPU: rebuild and re-upload only the numerical values */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers into row i of the host factor */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
987d460d7bfSJunchao Zhang #endif
988087f3262SPaul Mullowney 
/*
  Finish setting up the Cholesky/ICC triangular-solve data on the GPU.

  Uploads (or rebuilds) the device-side factor structures for the factored
  matrix A, records the factor's nonzero count, and -- when the factorization
  used a non-identity row permutation -- caches the permutation and its inverse
  on the device so the solve routines can permute b and x there.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row;
  PetscInt                      nrows   = A->rmap->n;
  PetscBool                     identity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows); /* scratch vector for the two-stage solve */
#endif
  /* off-diagonals appear in both triangles; the diagonal only once */
  factors->nnz = (aij->nz - nrows) * 2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* non-identity ordering: mirror the row permutation and its inverse on the GPU */
  PetscCall(ISIdentity(rowperm, &identity));
  if (!identity) {
    IS              invperm;
    const PetscInt *inv_idx, *row_idx;

    PetscCall(ISInvertPermutation(rowperm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &inv_idx));
    PetscCall(ISGetIndices(rowperm, &row_idx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(row_idx, row_idx + nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(inv_idx, inv_idx + nrows);
    PetscCall(ISRestoreIndices(invperm, &inv_idx));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(rowperm, &row_idx));
    PetscCall(PetscLogCpuToGpu(2. * nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1030087f3262SPaul Mullowney 
/*
  Numeric Cholesky factorization for SEQAIJCUSPARSE matrices.

  The factorization itself is delegated to the CPU kernel
  MatCholeskyFactorNumeric_SeqAIJ (after making sure the host copy of A is
  current); this routine then installs the appropriate GPU solve callbacks and
  uploads the triangular factors to the device.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* factor from up-to-date host values */
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* fresh factor currently lives on the host */

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* SpSV path: one callback covers both solve directions */
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* legacy path: pick the solve flavor based on the factor's row ordering */
  Mat_SeqAIJ *bdata = (Mat_SeqAIJ *)B->data;
  IS          rowis = bdata->row;
  PetscBool   identity;

  PetscCall(ISIdentity(rowis, &identity));
  if (!identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
#endif
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* upload the triangular factors (and permutations, if any) to the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
10639ae82921SPaul Mullowney 
1064b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1065d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1066d71ae5a4SJacob Faibussowitsch {
1067bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1068aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1069aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1070da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1071da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1072aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
1073aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
1074aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
1075aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
1076b175d8bbSPaul Mullowney 
1077bda325fcSPaul Mullowney   PetscFunctionBegin;
1078aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
10799566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
1080da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1081aa372e3fSPaul Mullowney 
1082aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1083aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1084aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
10859371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1086aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1087aa372e3fSPaul Mullowney 
1088aa372e3fSPaul Mullowney   /* Create the matrix description */
10899566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
10909566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
10919566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
10929566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
10939566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1094aa372e3fSPaul Mullowney 
1095aa372e3fSPaul Mullowney   /* set the operation */
1096aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1097aa372e3fSPaul Mullowney 
1098aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1099aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
1100afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1101afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1102aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1103afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1104afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1105afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1106aa372e3fSPaul Mullowney 
1107aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1108afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11099371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
11109371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
11119371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
11129566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1113afb2bd1cSJunchao Zhang   #endif
1114afb2bd1cSJunchao Zhang 
11159566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11169f7ba44dSJacob Faibussowitsch   {
11179f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11189f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
11199371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1120afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11219f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1122afb2bd1cSJunchao Zhang   #else
11239f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1124afb2bd1cSJunchao Zhang   #endif
11259f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11269f7ba44dSJacob Faibussowitsch   }
11279f7ba44dSJacob Faibussowitsch 
11289566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11299566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1130aa372e3fSPaul Mullowney 
1131afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11329566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1133261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
11341b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11359371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11369371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
11379566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1138afb2bd1cSJunchao Zhang   #endif
1139afb2bd1cSJunchao Zhang 
1140afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11419371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11429f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
11439f7ba44dSJacob Faibussowitsch 
11449566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11459566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1146aa372e3fSPaul Mullowney 
1147da79fbbcSStefano Zampini   /* assign the pointer */
1148aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1149aa372e3fSPaul Mullowney 
1150aa372e3fSPaul Mullowney   /*********************************************/
1151aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1152aa372e3fSPaul Mullowney   /*********************************************/
1153aa372e3fSPaul Mullowney 
1154aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
11559566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
1156da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1157aa372e3fSPaul Mullowney 
1158aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1159aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1160aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
11619371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1162aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1163aa372e3fSPaul Mullowney 
1164aa372e3fSPaul Mullowney   /* Create the matrix description */
11659566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
11669566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
11679566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
11689566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
11699566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1170aa372e3fSPaul Mullowney 
1171aa372e3fSPaul Mullowney   /* set the operation */
1172aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1173aa372e3fSPaul Mullowney 
1174aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1175aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
1176afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1177afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1178aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1179afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1180afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1181afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1182aa372e3fSPaul Mullowney 
1183aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1184afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11859371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
11869371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
11879371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
11889566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1189afb2bd1cSJunchao Zhang   #endif
1190afb2bd1cSJunchao Zhang 
11919566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11929f7ba44dSJacob Faibussowitsch   {
11939f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11949f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
11959371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1196afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11979f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1198afb2bd1cSJunchao Zhang   #else
11999f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1200afb2bd1cSJunchao Zhang   #endif
12019f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
12029f7ba44dSJacob Faibussowitsch   }
1203d49cd2b7SBarry Smith 
12049566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
12059566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1206aa372e3fSPaul Mullowney 
1207afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
12089566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1209261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
12101b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
12119371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12129371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
12139566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1214afb2bd1cSJunchao Zhang   #endif
1215afb2bd1cSJunchao Zhang 
1216afb2bd1cSJunchao Zhang   /* perform the solve analysis */
  /* TODO(cleanup): this per-factor buffer-size/analysis setup is repeated for each triangular factor; it should be refactored into a helper function */
12189371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12199f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1220d49cd2b7SBarry Smith 
12219566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
12229566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1223aa372e3fSPaul Mullowney 
1224da79fbbcSStefano Zampini   /* assign the pointer */
1225aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
12263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1227bda325fcSPaul Mullowney }
1228d460d7bfSJunchao Zhang #endif
1229bda325fcSPaul Mullowney 
/* Unary functor converting a PetscScalar to a PetscInt by casting its real part
   (truncation toward zero); callable from both host and device code, e.g. as the
   transform operator in thrust::transform(). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar val)
  {
    return static_cast<PetscInt>(PetscRealPart(val));
  }
};
1233a49f1ed0SStefano Zampini 
/*
  Build (or refresh) an explicit transpose of A on the GPU and cache it in
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose, setting A->transupdated.

  Returns immediately when A->transupdated is already set. For the CSR format an
  existing cached transpose is updated in place by permuting the current values of
  A through a cached CSR->CSC entry permutation (cusparsestruct->csr2csc_i); for
  ELL/HYB formats the transpose is rebuilt from scratch.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS); /* cached transpose is still current */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* only the CSR path below can update an existing transpose in place; other formats rebuild it */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta: device-side scalar constants used by SpMV-like calls */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* the transpose swaps the row/column dimensions but keeps the nonzero count */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* upload the (uncompressed) row offsets of A, needed by csr2csc below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* transpose path for the legacy HYB/ELL formats: HYB -> CSR -> CSC -> HYB */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build and cache the CSR->CSC entry permutation: run csr2csc once with the
         sequence 0,1,2,... as "values" so each transposed entry records (as a scalar)
         the CSR position it came from; csr2csc_i then holds those positions as
         integers, and later transpose updates reduce to the gather at the bottom
         of this function */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* convert the scalar-encoded source positions into the integer permutation */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the current values of A into the cached transpose through the permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1426bda325fcSPaul Mullowney 
1427b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Solve A x = b using the cached cuSPARSE SpSV LU factorization of A:
   optionally permute b by the row permutation, do the L then U triangular
   solves, then optionally undo the column permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *amat    = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             opA     = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               spsvAlg = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                        nrows   = A->rmap->n;
  const PetscScalar                    *bdata;
  PetscScalar                          *xdata;
  thrust::device_ptr<const PetscScalar> bptr;
  thrust::device_ptr<PetscScalar>       xptr;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  xptr = thrust::device_pointer_cast(xdata);
  bptr = thrust::device_pointer_cast(bdata);

  // When a row permutation is present, gather b through it into the work buffer
  // factors->X and solve from there; otherwise feed b to the solver directly
  if (factors->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bptr, factors->rpermIndices->begin()), thrust::make_permutation_iterator(bptr, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdata));
  }

  // Forward solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, opA, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, spsvAlg, factors->spsvDescr_L));

  // Backward solve U X = Y; target the work buffer when the column permutation
  // still has to be applied, otherwise write straight into x
  if (factors->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xdata));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, opA, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, spsvAlg, factors->spsvDescr_U));

  // Undo the column permutation, scattering the work buffer into x
  if (factors->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + nrows), factors->cpermIndices->end()), xptr));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * amat->nz - nrows)); // one mul+add per nonzero, minus the diagonal divisions counted once
  PetscFunctionReturn(PETSC_SUCCESS);
}
1479d460d7bfSJunchao Zhang 
1480d460d7bfSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1481d460d7bfSJunchao Zhang {
1482d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1483d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1484d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1485d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1486d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1487d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1488d460d7bfSJunchao Zhang   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1489d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1490d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1491d460d7bfSJunchao Zhang 
1492d460d7bfSJunchao Zhang   PetscFunctionBegin;
1493d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1494d460d7bfSJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1495d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1496d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1497d460d7bfSJunchao Zhang                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1498d460d7bfSJunchao Zhang 
1499d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1500d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1501d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1502d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1503d460d7bfSJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1504d460d7bfSJunchao Zhang   }
1505d460d7bfSJunchao Zhang 
1506d460d7bfSJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
1507d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1508d460d7bfSJunchao Zhang 
1509d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1510d460d7bfSJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1511d460d7bfSJunchao Zhang   }
1512d460d7bfSJunchao Zhang 
1513d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1514d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1515d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1516d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1517d460d7bfSJunchao Zhang 
1518d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1519d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1520d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1521d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1522d460d7bfSJunchao Zhang   } else {
1523d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1524d460d7bfSJunchao Zhang   }
1525d460d7bfSJunchao Zhang 
1526d460d7bfSJunchao Zhang   // Solve Ut Y = X
1527d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1528d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1529d460d7bfSJunchao Zhang 
1530d460d7bfSJunchao Zhang   // Solve Lt X = Y
1531d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1532d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1533d460d7bfSJunchao Zhang   } else {
1534d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1535d460d7bfSJunchao Zhang   }
1536d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1537d460d7bfSJunchao Zhang 
1538d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1539d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1540d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1541d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1542d460d7bfSJunchao Zhang   }
1543d460d7bfSJunchao Zhang 
1544d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1545d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1546d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1547d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1548d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1549d460d7bfSJunchao Zhang }
1550d460d7bfSJunchao Zhang #else
1551a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1552d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1553d71ae5a4SJacob Faibussowitsch {
1554c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1555465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1556465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1557465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1558465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1559bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1560aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1561aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1562aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1563bda325fcSPaul Mullowney 
1564bda325fcSPaul Mullowney   PetscFunctionBegin;
1565aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1566aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15679566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1568aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1569aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1570bda325fcSPaul Mullowney   }
1571bda325fcSPaul Mullowney 
1572bda325fcSPaul Mullowney   /* Get the GPU pointers */
15739566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
15749566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1575c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1576c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1577bda325fcSPaul Mullowney 
15789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1579aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
15809371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1581aa372e3fSPaul Mullowney 
1582aa372e3fSPaul Mullowney   /* First, solve U */
15839f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
15849f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1585aa372e3fSPaul Mullowney 
1586aa372e3fSPaul Mullowney   /* Then, solve L */
15879f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
15889f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1589aa372e3fSPaul Mullowney 
1590aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
15919371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1592aa372e3fSPaul Mullowney 
1593aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1594a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1595bda325fcSPaul Mullowney 
1596bda325fcSPaul Mullowney   /* restore */
15979566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
15989566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
15999566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16013ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1602bda325fcSPaul Mullowney }
1603bda325fcSPaul Mullowney 
1604d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1605d71ae5a4SJacob Faibussowitsch {
1606465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1607465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1608bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1609aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1610aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1611aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1612bda325fcSPaul Mullowney 
1613bda325fcSPaul Mullowney   PetscFunctionBegin;
1614aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1615aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
16169566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1617aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1618aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1619bda325fcSPaul Mullowney   }
1620bda325fcSPaul Mullowney 
1621bda325fcSPaul Mullowney   /* Get the GPU pointers */
16229566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16239566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1624bda325fcSPaul Mullowney 
16259566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1626aa372e3fSPaul Mullowney   /* First, solve U */
16279f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
16289f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1629aa372e3fSPaul Mullowney 
1630aa372e3fSPaul Mullowney   /* Then, solve L */
16319f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
16329f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1633bda325fcSPaul Mullowney 
1634bda325fcSPaul Mullowney   /* restore */
16359566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16369566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16379566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16389566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1640bda325fcSPaul Mullowney }
1641bda325fcSPaul Mullowney 
1642d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1643d71ae5a4SJacob Faibussowitsch {
1644465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1645465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1646465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1647465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
16489ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1649aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1650aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1651aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16529ae82921SPaul Mullowney 
16539ae82921SPaul Mullowney   PetscFunctionBegin;
1654e057df02SPaul Mullowney   /* Get the GPU pointers */
16559566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16569566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1657c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1658c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16599ae82921SPaul Mullowney 
16609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1661aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
16629371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1663aa372e3fSPaul Mullowney 
1664aa372e3fSPaul Mullowney   /* Next, solve L */
16659f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16669f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1667aa372e3fSPaul Mullowney 
1668aa372e3fSPaul Mullowney   /* Then, solve U */
16699f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16709f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1671d49cd2b7SBarry Smith 
16724e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
16739371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
16749ae82921SPaul Mullowney 
16759566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16769566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16779566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16793ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16809ae82921SPaul Mullowney }
16819ae82921SPaul Mullowney 
1682d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1683d71ae5a4SJacob Faibussowitsch {
1684465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1685465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
16869ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1687aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1688aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1689aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16909ae82921SPaul Mullowney 
16919ae82921SPaul Mullowney   PetscFunctionBegin;
1692e057df02SPaul Mullowney   /* Get the GPU pointers */
16939566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16949566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
16959ae82921SPaul Mullowney 
16969566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1697aa372e3fSPaul Mullowney   /* First, solve L */
16989f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16999f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1700d49cd2b7SBarry Smith 
1701aa372e3fSPaul Mullowney   /* Next, solve U */
17029f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
17039f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
17049ae82921SPaul Mullowney 
17059566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
17069566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
17079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17089566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
17093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
17109ae82921SPaul Mullowney }
1711d460d7bfSJunchao Zhang #endif
17129ae82921SPaul Mullowney 
1713b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
17148eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1715d71ae5a4SJacob Faibussowitsch {
1716da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1717da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1718da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1719da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1720da112707SJunchao Zhang   PetscInt                      m, nz;
1721da112707SJunchao Zhang   PetscBool                     flg;
1722da112707SJunchao Zhang 
1723da112707SJunchao Zhang   PetscFunctionBegin;
1724da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1725da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1726da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1727da112707SJunchao Zhang   }
1728da112707SJunchao Zhang 
1729da112707SJunchao Zhang   /* Copy A's value to fact */
1730da112707SJunchao Zhang   m  = fact->rmap->n;
1731da112707SJunchao Zhang   nz = aij->nz;
1732da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1733da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1734da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1735da112707SJunchao Zhang 
1736bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeBegin());
1737da112707SJunchao Zhang   /* Factorize fact inplace */
17389371c9d4SSatish Balay   if (m)
17399371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1740d460d7bfSJunchao Zhang                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1741da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1742da112707SJunchao Zhang     int              numerical_zero;
1743da112707SJunchao Zhang     cusparseStatus_t status;
1744da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1745da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1746da112707SJunchao Zhang   }
1747da112707SJunchao Zhang 
1748*204a0e31SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
1749*204a0e31SJunchao Zhang   if (fs->updatedSpSVAnalysis) {
1750*204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1751*204a0e31SJunchao Zhang     if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
1752*204a0e31SJunchao Zhang   } else
1753*204a0e31SJunchao Zhang   #endif
1754*204a0e31SJunchao Zhang   {
175512ba2bc6SJunchao Zhang     /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
175612ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
175712ba2bc6SJunchao Zhang     */
17589371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1759da112707SJunchao Zhang 
17609371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1761da112707SJunchao Zhang 
1762*204a0e31SJunchao Zhang     fs->updatedSpSVAnalysis = PETSC_TRUE;
176312ba2bc6SJunchao Zhang     /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
176412ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
1765*204a0e31SJunchao Zhang   }
176612ba2bc6SJunchao Zhang 
1767da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1768d460d7bfSJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1769d460d7bfSJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1770da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1771da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1772bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeEnd());
1773da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
17743ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1775da112707SJunchao Zhang }
1776da112707SJunchao Zhang 
17778eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1778d71ae5a4SJacob Faibussowitsch {
1779da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1780da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1781da112707SJunchao Zhang   PetscInt                      m, nz;
1782da112707SJunchao Zhang 
1783da112707SJunchao Zhang   PetscFunctionBegin;
1784da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1785da112707SJunchao Zhang     PetscInt  i;
1786da112707SJunchao Zhang     PetscBool flg, missing;
1787da112707SJunchao Zhang 
1788da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1789da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1790da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1791da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1792da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1793da112707SJunchao Zhang   }
1794da112707SJunchao Zhang 
1795da112707SJunchao Zhang   /* Free the old stale stuff */
1796da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1797da112707SJunchao Zhang 
1798da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1799da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1800da112707SJunchao Zhang    */
1801da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1802da112707SJunchao Zhang 
1803da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1804da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1805da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1806da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1807da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1808da112707SJunchao Zhang 
1809da112707SJunchao Zhang   aij->row = NULL;
1810da112707SJunchao Zhang   aij->col = NULL;
1811da112707SJunchao Zhang 
1812da112707SJunchao Zhang   /* ====================================================================== */
1813da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1814da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1815da112707SJunchao Zhang   /* ====================================================================== */
1816da112707SJunchao Zhang   const int *Ai, *Aj;
1817da112707SJunchao Zhang 
1818da112707SJunchao Zhang   m  = fact->rmap->n;
1819da112707SJunchao Zhang   nz = aij->nz;
1820da112707SJunchao Zhang 
1821f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1822f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1823f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1824d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1825d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1826d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1827da112707SJunchao Zhang 
1828da112707SJunchao Zhang   /* ====================================================================== */
1829da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1830da112707SJunchao Zhang   /* ====================================================================== */
1831da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1832da112707SJunchao Zhang   cusparseDiagType_t diagType;
1833da112707SJunchao Zhang 
1834da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1835da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1836da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1837da112707SJunchao Zhang 
1838da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1839da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1840da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1841da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1842da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1843da112707SJunchao Zhang   */
1844da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1845da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1846d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18479371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18489371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1849da112707SJunchao Zhang 
1850da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1851da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1852d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18539371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18549371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1855da112707SJunchao Zhang 
1856da112707SJunchao Zhang   /* ========================================================================= */
1857da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1858da112707SJunchao Zhang   /* ========================================================================= */
1859da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
18609371c9d4SSatish Balay   if (m)
18619371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1862d460d7bfSJunchao Zhang                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1863da112707SJunchao Zhang 
1864da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1865da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1866da112707SJunchao Zhang 
1867da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1868da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1869da112707SJunchao Zhang 
1870da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18719371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1872da112707SJunchao Zhang 
1873da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
18749371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1875da112707SJunchao Zhang 
1876da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
187712ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
187812ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
187912ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1880da112707SJunchao Zhang    */
188112ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
188212ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
188312ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1884da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
188512ba2bc6SJunchao Zhang   } else {
188612ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
188712ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1888da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
188912ba2bc6SJunchao Zhang   }
1890da112707SJunchao Zhang 
1891da112707SJunchao Zhang   /* ========================================================================== */
1892da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1893da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1894da112707SJunchao Zhang   /* ========================================================================== */
1895da112707SJunchao Zhang   int              structural_zero;
1896da112707SJunchao Zhang   cusparseStatus_t status;
1897da112707SJunchao Zhang 
1898da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
18999371c9d4SSatish Balay   if (m)
19009371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1901d460d7bfSJunchao Zhang                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1902da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1903da112707SJunchao Zhang     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1904da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1905da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1906da112707SJunchao Zhang   }
1907da112707SJunchao Zhang 
1908da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
19090dd8c0acSJunchao Zhang   {
1910da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
19110dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1912da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1913da112707SJunchao Zhang 
1914da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1915da112707SJunchao Zhang     Ai    = Aseq->i;
1916da112707SJunchao Zhang     Adiag = Aseq->diag;
1917da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1918da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1919da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1920da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1921da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1922da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1923da112707SJunchao Zhang         */
1924da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1925da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1926da112707SJunchao Zhang       }
1927da112707SJunchao Zhang     }
1928da112707SJunchao Zhang     fs->numericFactFlops = flops;
19290dd8c0acSJunchao Zhang   }
1930da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
19313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1932da112707SJunchao Zhang }
1933da112707SJunchao Zhang 
/* Triangular solve x = (L*L^T)^{-1} b using the ICC(0) factor stored in fact->spptr.
   The two SpSV solves reuse the single L descriptor: once non-transposed (forward
   solve) and once transposed (backward solve). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b */
  /* Point the dense-vector descriptors at b (input) and the internal work array fs->Y (output) */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  /* Rebind descriptor X to the output array x; the solve reads Y (holding y) and writes X */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves over nz nonzeros, minus the n diagonal divisions counted once */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1964da112707SJunchao Zhang 
/* Numeric ICC(0) factorization on the GPU: copy A's values into fact, run csric02
   in place, then (re)do or refresh the SpSV analysis for the L and L^T solves.
   Fix: the CUDA >= 12.1.1 fast path tested fs->csrVal twice on consecutive
   statements; the two updateMatrix calls are now under a single guard. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* Analysis was already done in a previous numeric factorization; just tell SpSV the
       values changed (same sparsity pattern), which is much cheaper than re-analysis. */
    if (fs->csrVal) {
      PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
      PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    }
  } else
  #endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  /* Factored values now live only on the GPU; install the ICC(0) solvers */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2026da112707SJunchao Zhang 
/* Symbolic setup for GPU ICC(0): mirror A's (uncompressed) CSR pattern into fact,
   create cusparse descriptors for M (the in-place factor) and L, size and allocate
   the csric02/SpSV work buffers (sharing the factorization buffer with the larger
   solve buffer), and run the csric02 analysis. The permutation IS is unnamed and
   unused: this path is only selected for the identity ordering. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0) has no fill beyond the original pattern */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L shares M's device arrays: the factorization is in place, so after csric02 the
     lower triangle of M *is* L */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* assumes ~half of the row's nonzeros sit left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2168da112707SJunchao Zhang #endif
2169da112707SJunchao Zhang 
/* Numeric LU factorization: the factorization itself runs on the CPU
   (MatLUFactorNumeric_SeqAIJ); afterwards the triangular factors are copied to
   the GPU (unless use_cpu_solve is set) and the GPU solve routines installed. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* host factorization needs A's values on the host */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factors were just computed on the host */

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2208d460d7bfSJunchao Zhang 
/* Symbolic LU factorization: discard any stale GPU triangular-factor state,
   delegate the symbolic phase to the host SeqAIJ routine, and install the
   CUSPARSE numeric-factorization entry point. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Any previously built GPU factors are invalid for the new pattern */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2219d460d7bfSJunchao Zhang 
/* Symbolic ILU factorization dispatcher: for ILU(0) with identity row/column
   permutations (and CUDA >= 11.4, factorization not forced to the host) use the
   all-GPU csrilu02 path; otherwise fall back to the host SeqAIJ symbolic phase. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (!info->factoronhost) { /* when factoring on host, skip the checks so the GPU path below is not taken */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2242da112707SJunchao Zhang 
/* Symbolic ICC factorization dispatcher: ICC(0) with the identity permutation
   (and CUDA >= 11.4, factorization not forced to the host) runs entirely on the
   GPU via csric02; every other configuration falls back to the host SeqAIJ path. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool identity = PETSC_FALSE;
  /* Only probe the permutation when the GPU path is even a candidate */
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &identity));
  if (!info->levels && identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2262da112707SJunchao Zhang 
/* Symbolic Cholesky factorization: always the host path — reset stale GPU
   factor state, run the SeqAIJ symbolic phase, and hook up the CUSPARSE
   numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Drop factors built for a previous pattern before re-symbolizing */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2273da112707SJunchao Zhang 
/* Query callback composed on factor matrices: reports the solver package name (MATSOLVERCUSPARSE) */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2280841d4cb1SJunchao Zhang 
2281841d4cb1SJunchao Zhang /*MC
2282841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
228311a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
228611a5261eSBarry Smith   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2287841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
2288841d4cb1SJunchao Zhang 
2289841d4cb1SJunchao Zhang   Level: beginner
2290841d4cb1SJunchao Zhang 
22911cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
22922ef1f0ffSBarry Smith           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2293841d4cb1SJunchao Zhang M*/
2294841d4cb1SJunchao Zhang 
/* Factory routine registered for MATSOLVERCUSPARSE: creates the (square) factor matrix B
   of type MATSEQAIJCUSPARSE for the requested factorization kind and installs the
   appropriate symbolic-phase function pointers and preferred orderings. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  const PetscInt m = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, m, m, m, m));
  (*B)->factortype = ftype; // must be set before MatSetType() so spptr is allocated as Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (A->boundtocpu) {
      /* matrix pinned to host: use the plain SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2335841d4cb1SJunchao Zhang 
/* Copy matrix values from the device back into the host CSR array a->a when the
   device copy is the only up-to-date one (offloadmask == PETSC_OFFLOAD_GPU).
   Only values move (the sparsity pattern is not touched); on success the matrix
   is marked valid on both host and device. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* same spptr, reinterpreted when A is a factored matrix */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: values live in the CsrMatrix owned by the mult struct */
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
23657e8381f9SStefano Zampini 
/* Hand out the host CSR value array for read/write access, first syncing any newer device values */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
237367a45760SJunchao Zhang 
/* End read/write access: the caller may have modified host values, so the device copy is now stale */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
238167a45760SJunchao Zhang 
/* Hand out the host CSR value array for read-only access; still requires an up-to-date host copy */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
238967a45760SJunchao Zhang 
/* End read-only access: nothing was modified, so the offload mask is left untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
239667a45760SJunchao Zhang 
/* Hand out the host CSR value array for write-only access; no device-to-host sync is needed
   since the current values will be overwritten */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
240367a45760SJunchao Zhang 
/* End write-only access: host values were (re)written, so the device copy is now stale */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
24117e8381f9SStefano Zampini 
/* Expose the device CSR arrays (row offsets, column indices, values) and their memory type.
   Any of i/j/a/mtype may be NULL if not wanted. Not supported for factored matrices, and the
   32-bit cuSPARSE index arrays cannot be aliased as PetscInt when 64-bit indices are enabled.
   Fix vs. previous revision: corrected the grammatically broken error message
   ("does not supported" -> "does not support"). */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  /* make sure the device copy is current before handing out device pointers */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
24427ee59b9bSJunchao Zhang 
/* Mirror the host AIJ matrix onto the GPU in the requested cuSPARSE storage format.
   If only values changed (same nonzero state, CSR format), just re-upload the values;
   otherwise rebuild the whole device mult structure (descriptors, scalar constants,
   CSR/ELL/HYB storage, compressed-row index array). No-op unless the host copy is
   the authoritative one (PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE when only the structure (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed: any cached explicit transpose no longer matches (structure kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      /* nonzero pattern (or storage format) changed: tear down and rebuild all device data */
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* with compressed-row storage only the nonempty rows are represented on device */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* structure-only upload (no values yet): do not mark the device copy as holding values */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1 and 0) used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 only: build a temporary CSR matrix, convert it to HYB/ELL, then discard it */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
25929ae82921SPaul Mullowney 
/* Thrust functor for zipped (src, dst) tuples: dst += src */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<1>(zipped) + thrust::get<0>(zipped);
  }
};
2600aa372e3fSPaul Mullowney 
/* Thrust functor for zipped (src, dst) tuples: dst = src */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
26087e8381f9SStefano Zampini 
/* Thrust functor for zipped (dst, src) tuples: dst = src (copies in the opposite direction) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
2616e6e9a74fSStefano Zampini 
/* Per-product workspace attached to C->product->data for sparse mat-mat operations.
   Released by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably records whether C was originally dense -- confirm at use sites */
  PetscScalar   *Bt;       /* device buffer (cudaFree'd in the destructor); presumably an explicit transpose of B -- TODO confirm */
  Mat            X;        /* intermediate dense product, used for MATPRODUCT_RARt/PtAP */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2641ccdfe979SStefano Zampini 
/* Destructor for the MatMatCusparse workspace stored in C->product->data:
   releases all device buffers, cuSPARSE descriptors, the intermediate matrix X,
   and finally the workspace struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
  #endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2665ccdfe979SStefano Zampini 
26664742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2667ccdfe979SStefano Zampini 
2668d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2669d71ae5a4SJacob Faibussowitsch {
2670ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2671ccdfe979SStefano Zampini   Mat                           A, B;
2672afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2673ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2674ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2675ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2676ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2677ccdfe979SStefano Zampini   const PetscScalar            *barray;
2678ccdfe979SStefano Zampini   PetscScalar                  *carray;
2679ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2680ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2681ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2682ccdfe979SStefano Zampini 
2683ccdfe979SStefano Zampini   PetscFunctionBegin;
2684ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
268528b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2686ccdfe979SStefano Zampini   mmdata = (MatMatCusparse *)product->data;
2687ccdfe979SStefano Zampini   A      = product->A;
2688ccdfe979SStefano Zampini   B      = product->B;
26899566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
269028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2691ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2692ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
269328b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
26949566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2695ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2696ccdfe979SStefano Zampini   switch (product->type) {
2697ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2698ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2699ccdfe979SStefano Zampini     mat = cusp->mat;
2700ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2701ccdfe979SStefano Zampini     m   = A->rmap->n;
2702ccdfe979SStefano Zampini     n   = B->cmap->n;
2703ccdfe979SStefano Zampini     break;
2704ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
27051a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2706e6e9a74fSStefano Zampini       mat = cusp->mat;
2707e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2708e6e9a74fSStefano Zampini     } else {
27099566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2710ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2711ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2712e6e9a74fSStefano Zampini     }
2713ccdfe979SStefano Zampini     m = A->cmap->n;
2714ccdfe979SStefano Zampini     n = B->cmap->n;
2715ccdfe979SStefano Zampini     break;
2716ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2717ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2718ccdfe979SStefano Zampini     mat = cusp->mat;
2719ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2720ccdfe979SStefano Zampini     m   = A->rmap->n;
2721ccdfe979SStefano Zampini     n   = B->rmap->n;
2722ccdfe979SStefano Zampini     break;
2723d71ae5a4SJacob Faibussowitsch   default:
2724d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2725ccdfe979SStefano Zampini   }
272628b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2727ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2728ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
27299566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
27309566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2731cd3f9d89SJunchao Zhang   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2732afb2bd1cSJunchao Zhang 
27339566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2734c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2735cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
27369566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2737c8378d12SStefano Zampini   } else {
2738cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
27399566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2740c8378d12SStefano Zampini   }
2741c8378d12SStefano Zampini 
27429566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2743afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2744afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2745fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2746fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2747fe5544b9SJunchao Zhang   #else
2748fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2749fe5544b9SJunchao Zhang   #endif
2750fe5544b9SJunchao Zhang 
2751a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2752afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2753fcdce8c4SStefano Zampini     size_t mmBufferSize;
27549371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
27559371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
27569371c9d4SSatish Balay       mmdata->matBDescr = NULL;
27579371c9d4SSatish Balay     }
2758afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
27599566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2760afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2761afb2bd1cSJunchao Zhang     }
2762c8378d12SStefano Zampini 
27639371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
27649371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
27659371c9d4SSatish Balay       mmdata->matCDescr = NULL;
27669371c9d4SSatish Balay     }
2767afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
27689566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2769afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2770afb2bd1cSJunchao Zhang     }
2771afb2bd1cSJunchao Zhang 
2772fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2773fe5544b9SJunchao Zhang     if (matADescr) {
277417f5f06fSJunchao Zhang       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2775fe5544b9SJunchao Zhang       matADescr = NULL;
2776fe5544b9SJunchao Zhang     }
2777fe5544b9SJunchao Zhang   #endif
2778fe5544b9SJunchao Zhang 
2779fe5544b9SJunchao Zhang     if (!matADescr) {
2780fe5544b9SJunchao Zhang       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
27819371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
27829371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2783afb2bd1cSJunchao Zhang     }
2784fe5544b9SJunchao Zhang 
2785fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2786fe5544b9SJunchao Zhang 
2787fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
27889566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
27899566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2790fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2791fcdce8c4SStefano Zampini     }
2792fe5544b9SJunchao Zhang 
2793fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0
2794fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2795fe5544b9SJunchao Zhang   #endif
2796fe5544b9SJunchao Zhang 
2797afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2798afb2bd1cSJunchao Zhang   } else {
2799afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2800fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
28019566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
28029566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2803afb2bd1cSJunchao Zhang   }
2804afb2bd1cSJunchao Zhang 
2805afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2806fe5544b9SJunchao Zhang   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2807afb2bd1cSJunchao Zhang #else
2808afb2bd1cSJunchao Zhang   PetscInt k;
2809afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2810ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2811ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2812ccdfe979SStefano Zampini     cublasStatus_t cerr;
2813ccdfe979SStefano Zampini 
28149566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
28159371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
28169371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2817ccdfe979SStefano Zampini     blda = B->cmap->n;
2818afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2819afb2bd1cSJunchao Zhang   } else {
2820afb2bd1cSJunchao Zhang     k = B->rmap->n;
2821ccdfe979SStefano Zampini   }
2822ccdfe979SStefano Zampini 
2823afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
28249371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
28259371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2826afb2bd1cSJunchao Zhang #endif
28279566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
28289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2829cd3f9d89SJunchao Zhang   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2830ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2831cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28324742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2833ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2834cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28354742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2836ccdfe979SStefano Zampini   } else {
2837cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2838ccdfe979SStefano Zampini   }
283948a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
284048a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
28413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2842ccdfe979SStefano Zampini }
2843ccdfe979SStefano Zampini 
2844d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2845d71ae5a4SJacob Faibussowitsch {
2846ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2847ccdfe979SStefano Zampini   Mat                 A, B;
2848ccdfe979SStefano Zampini   PetscInt            m, n;
2849ccdfe979SStefano Zampini   PetscBool           cisdense, flg;
2850ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2851ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2852ccdfe979SStefano Zampini 
2853ccdfe979SStefano Zampini   PetscFunctionBegin;
2854ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
285528b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2856ccdfe979SStefano Zampini   A = product->A;
2857ccdfe979SStefano Zampini   B = product->B;
28589566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
285928b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2860ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
286108401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2862ccdfe979SStefano Zampini   switch (product->type) {
2863ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2864ccdfe979SStefano Zampini     m = A->rmap->n;
2865ccdfe979SStefano Zampini     n = B->cmap->n;
28660e6a1e94SMark Adams     PetscCall(MatSetBlockSizesFromMats(C, A, B));
2867ccdfe979SStefano Zampini     break;
2868ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2869ccdfe979SStefano Zampini     m = A->cmap->n;
2870ccdfe979SStefano Zampini     n = B->cmap->n;
28710e6a1e94SMark Adams     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
28720e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2873ccdfe979SStefano Zampini     break;
2874ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2875ccdfe979SStefano Zampini     m = A->rmap->n;
2876ccdfe979SStefano Zampini     n = B->rmap->n;
28770e6a1e94SMark Adams     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
28780e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2879ccdfe979SStefano Zampini     break;
2880ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2881ccdfe979SStefano Zampini     m = B->cmap->n;
2882ccdfe979SStefano Zampini     n = B->cmap->n;
28830e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs));
28840e6a1e94SMark Adams     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
2885ccdfe979SStefano Zampini     break;
2886ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2887ccdfe979SStefano Zampini     m = B->rmap->n;
2888ccdfe979SStefano Zampini     n = B->rmap->n;
28890e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs));
28900e6a1e94SMark Adams     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
2891ccdfe979SStefano Zampini     break;
2892d71ae5a4SJacob Faibussowitsch   default:
2893d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2894ccdfe979SStefano Zampini   }
28959566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2896ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
28979566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
28989566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2899ccdfe979SStefano Zampini 
2900ccdfe979SStefano Zampini   /* product data */
29019566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2902ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2903afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2904afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
290548a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2906afb2bd1cSJunchao Zhang #endif
2907ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2908ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
29099566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
29109566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2911ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
29129566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2913ccdfe979SStefano Zampini     } else {
29149566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2915ccdfe979SStefano Zampini     }
2916ccdfe979SStefano Zampini   }
2917ccdfe979SStefano Zampini   C->product->data    = mmdata;
2918ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2919ccdfe979SStefano Zampini 
2920ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
29213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2922ccdfe979SStefano Zampini }
2923ccdfe979SStefano Zampini 
2924d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2925d71ae5a4SJacob Faibussowitsch {
2926ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2927fcdce8c4SStefano Zampini   Mat                           A, B;
2928fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2929fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2930fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2931fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2932fcdce8c4SStefano Zampini   PetscBool                     flg;
2933fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2934fcdce8c4SStefano Zampini   MatProductType                ptype;
2935fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2936fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2937fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2938fcdce8c4SStefano Zampini #endif
2939b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2940ccdfe979SStefano Zampini 
2941ccdfe979SStefano Zampini   PetscFunctionBegin;
2942ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
294328b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
29449566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
294528b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2946fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse *)C->product->data;
2947fcdce8c4SStefano Zampini   A      = product->A;
2948fcdce8c4SStefano Zampini   B      = product->B;
2949fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2950fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2951fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
295208401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2953fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
295428b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2955fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
295628b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2957fcdce8c4SStefano Zampini     goto finalize;
2958fcdce8c4SStefano Zampini   }
2959fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
29609566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
296128b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
29629566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
296328b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
296428b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
296528b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2966fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2967fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2968fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
296908401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
297008401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
297108401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
29729566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
29739566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2974fcdce8c4SStefano Zampini 
2975fcdce8c4SStefano Zampini   ptype = product->type;
2976b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2977fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
297828b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2979fa046f9fSJunchao Zhang   }
2980b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2981fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
298228b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2983fa046f9fSJunchao Zhang   }
2984fcdce8c4SStefano Zampini   switch (ptype) {
2985fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2986fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2987fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2988fcdce8c4SStefano Zampini     break;
2989fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2990fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2991fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2992fcdce8c4SStefano Zampini     break;
2993fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2994fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2995fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2996fcdce8c4SStefano Zampini     break;
2997d71ae5a4SJacob Faibussowitsch   default:
2998d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2999fcdce8c4SStefano Zampini   }
3000fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
300128b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
300228b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
300328b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
3004fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3005fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
3006fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
300728b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
300828b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
300928b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
30109566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3011fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3012fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
30139566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3014b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
30159371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30169371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3017b4285af6SJunchao Zhang   #else
30189371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
30199371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
30209371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30219371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3022b4285af6SJunchao Zhang   #endif
3023fcdce8c4SStefano Zampini #else
30249371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30259371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
30269371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3027fcdce8c4SStefano Zampini #endif
30289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30299566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
30309566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3031fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
3032fcdce8c4SStefano Zampini finalize:
3033fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
30349566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
30359566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
30369566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
3037fcdce8c4SStefano Zampini   c->reallocs = 0;
3038fcdce8c4SStefano Zampini   C->info.mallocs += 0;
3039fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
3040fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
3041fcdce8c4SStefano Zampini   C->num_ass++;
30423ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3043ccdfe979SStefano Zampini }
3044fcdce8c4SStefano Zampini 
3045d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3046d71ae5a4SJacob Faibussowitsch {
3047fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
3048fcdce8c4SStefano Zampini   Mat                           A, B;
3049fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3050fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
3051fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3052fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3053fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
3054fcdce8c4SStefano Zampini   PetscBool                     flg;
3055fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
3056fcdce8c4SStefano Zampini   MatProductType                ptype;
3057fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
3058fcdce8c4SStefano Zampini   PetscLogDouble                flops;
3059fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
3060fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3061fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3062fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
3063fcdce8c4SStefano Zampini #else
3064fcdce8c4SStefano Zampini   int cnz;
3065fcdce8c4SStefano Zampini #endif
3066b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3067fcdce8c4SStefano Zampini 
3068fcdce8c4SStefano Zampini   PetscFunctionBegin;
3069fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
307028b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3071fcdce8c4SStefano Zampini   A = product->A;
3072fcdce8c4SStefano Zampini   B = product->B;
30739566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
307428b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
30759566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
307628b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3077fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
3078fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
3079fcdce8c4SStefano Zampini   /* product data */
30809566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3081fcdce8c4SStefano Zampini   C->product->data    = mmdata;
3082fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
3083fcdce8c4SStefano Zampini 
30849566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
30859566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3086d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3087d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
308808401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
308908401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3090d60bce21SJunchao Zhang 
3091fcdce8c4SStefano Zampini   ptype = product->type;
3092b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3093fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3094fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3095fa046f9fSJunchao Zhang   }
3096b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3097fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3098fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3099fa046f9fSJunchao Zhang   }
3100fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
3101fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
3102fcdce8c4SStefano Zampini   switch (ptype) {
3103fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3104fcdce8c4SStefano Zampini     m    = A->rmap->n;
3105fcdce8c4SStefano Zampini     n    = B->cmap->n;
3106fcdce8c4SStefano Zampini     k    = A->cmap->n;
3107fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3108fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3109fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3110fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3111fcdce8c4SStefano Zampini     break;
3112fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3113fcdce8c4SStefano Zampini     m = A->cmap->n;
3114fcdce8c4SStefano Zampini     n = B->cmap->n;
3115fcdce8c4SStefano Zampini     k = A->rmap->n;
31169566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3117fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3118fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3119fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3120fcdce8c4SStefano Zampini     break;
3121fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3122fcdce8c4SStefano Zampini     m = A->rmap->n;
3123fcdce8c4SStefano Zampini     n = B->rmap->n;
3124fcdce8c4SStefano Zampini     k = A->cmap->n;
31259566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3126fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3127fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3128fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3129fcdce8c4SStefano Zampini     break;
3130d71ae5a4SJacob Faibussowitsch   default:
3131d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3132fcdce8c4SStefano Zampini   }
3133fcdce8c4SStefano Zampini 
3134fcdce8c4SStefano Zampini   /* create cusparse matrix */
31359566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
31369566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3137fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
3138fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3139fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3140fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
3141fcdce8c4SStefano Zampini 
3142fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
3143fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
3144fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
31459566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
31469566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3147fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3148fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3149fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3150fcdce8c4SStefano Zampini   } else {
3151fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
3152fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
3153fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
3154fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
3155fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
3156fcdce8c4SStefano Zampini   }
3157fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3158fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
3159fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
3160fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
3161fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
3162fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
31639566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
31649566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
31659566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3166f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3167f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3168f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
31699566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31709566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31719566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3172fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3173d460d7bfSJunchao Zhang     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3174fcdce8c4SStefano Zampini     c->nz                = 0;
3175fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3176fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
3177fcdce8c4SStefano Zampini     goto finalizesym;
3178fcdce8c4SStefano Zampini   }
3179fcdce8c4SStefano Zampini 
318028b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
318128b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3182fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3183fcdce8c4SStefano Zampini   if (!biscompressed) {
3184fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
3185fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3186fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
3187fcdce8c4SStefano Zampini #endif
3188fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
3189fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3190fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
3191fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
3192fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
3193fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
3194fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
3195fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
3196fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
3197fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3198fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
31999566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3200fcdce8c4SStefano Zampini     }
3201fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3202fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
3203fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3204fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
32059371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
32069371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
3207fcdce8c4SStefano Zampini     }
3208fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
3209fcdce8c4SStefano Zampini #endif
3210fcdce8c4SStefano Zampini   }
321128b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
321228b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3213fcdce8c4SStefano Zampini   /* precompute flops count */
3214fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
3215fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3216fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
3217fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
3218fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
3219fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
3220fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3221fcdce8c4SStefano Zampini       }
3222fcdce8c4SStefano Zampini     }
3223fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
3224fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3225fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
3226fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3227fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
3228fcdce8c4SStefano Zampini     }
3229fcdce8c4SStefano Zampini   } else { /* TODO */
3230fcdce8c4SStefano Zampini     flops = 0.;
3231fcdce8c4SStefano Zampini   }
3232fcdce8c4SStefano Zampini 
3233fcdce8c4SStefano Zampini   mmdata->flops = flops;
32349566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3235b4285af6SJunchao Zhang 
3236fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
32379566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
32381ffab3bdSJunchao Zhang   // cuda-12.2 requires non-null csrRowOffsets
32391ffab3bdSJunchao Zhang   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
32409371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32419566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3242b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3243b4285af6SJunchao Zhang   {
3244b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3245b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3246b4285af6SJunchao Zhang   */
3247b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
3248b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
3249b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
3250b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3251b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
3252b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
3253b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
3254b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
3255b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
3256b4285af6SJunchao Zhang 
3257b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
32589371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
32599371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32609566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3261b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
32629371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
32639371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3264b4285af6SJunchao Zhang 
32659371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
32669371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32679566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
32689566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
32699566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
32709371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
32719371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32729566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
32739566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
3274b4285af6SJunchao Zhang 
3275b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
32769566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3277b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
3278b4285af6SJunchao Zhang     /* allocate matrix C */
32799371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32809371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32819371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
32829371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3283b4285af6SJunchao Zhang     /* update matC with the new pointers */
32849371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32859371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3286b4285af6SJunchao Zhang 
32879371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
32889371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32899566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
32909371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
32919371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32929566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
32939371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32949371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32959566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3296b4285af6SJunchao Zhang   }
3297ae37ee31SJunchao Zhang   #else
3298b4285af6SJunchao Zhang   size_t bufSize2;
3299fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
33009371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
33019371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
33029566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3303fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
33049371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
33059371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3306fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
33079371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
33089371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3309fcdce8c4SStefano Zampini   /* Neither the CUSPARSE documentation nor the API is clear on this point:
3310fcdce8c4SStefano Zampini      we need both buffers to perform the operations properly!
3311fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3312fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3313fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
33149566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3315fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
33169371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
33179371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3318fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
33199566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3320fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
33219371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
33229371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3323fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33249566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3325fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33269566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
33279371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
33289371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
33299371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
33309371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3331ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3332fcdce8c4SStefano Zampini #else
33339566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
33349371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33359371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
33369371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3337fcdce8c4SStefano Zampini   c->nz                = cnz;
3338fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33399566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3340fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33419566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3342fcdce8c4SStefano Zampini 
33439566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3344fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3345fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows one to perform the symbolic phase by passing NULL for values, but it seems quite buggy when
3346fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
33479371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33489371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
33499371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3350fcdce8c4SStefano Zampini #endif
33519566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
33529566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3353fcdce8c4SStefano Zampini finalizesym:
3354fcdce8c4SStefano Zampini   c->free_a = PETSC_TRUE;
33559f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
33569f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3357fcdce8c4SStefano Zampini   c->free_ij = PETSC_TRUE;
33587de69702SBarry Smith   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3359fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3360fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3361fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3362fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3363fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3364fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33659566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33669566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3367fcdce8c4SStefano Zampini   } else {
3368fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3369fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33709566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33719566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3372fcdce8c4SStefano Zampini   }
3373fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3374fcdce8c4SStefano Zampini     PetscInt r = 0;
3375fcdce8c4SStefano Zampini     c->i[0]    = 0;
3376fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3377fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3378fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3379fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3380fcdce8c4SStefano Zampini     }
3381fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3382fcdce8c4SStefano Zampini   }
33839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
33849566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
33859566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3386fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3387fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3388fcdce8c4SStefano Zampini   c->rmax          = 0;
3389fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3390fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3391fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3392fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
3393fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3394fcdce8c4SStefano Zampini   }
33959566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
33969566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3397fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3398fcdce8c4SStefano Zampini 
3399fcdce8c4SStefano Zampini   C->nonzerostate++;
34009566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
34019566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3402fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3403fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3404fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3405fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3406fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3407abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3408fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3409fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3410fcdce8c4SStefano Zampini   }
3411fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
34123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3413fcdce8c4SStefano Zampini }
3414fcdce8c4SStefano Zampini 
3415fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3416fcdce8c4SStefano Zampini 
3417fcdce8c4SStefano Zampini /* handles sparse or dense B */
3418d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3419d71ae5a4SJacob Faibussowitsch {
3420fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3421fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3422fcdce8c4SStefano Zampini 
3423fcdce8c4SStefano Zampini   PetscFunctionBegin;
3424fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
34259566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
342648a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3427fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3428fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
342948a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3430fcdce8c4SStefano Zampini   }
343165e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
343265e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
343365e4b4d4SStefano Zampini     switch (product->type) {
343465e4b4d4SStefano Zampini     case MATPRODUCT_AB:
343565e4b4d4SStefano Zampini       if (product->api_user) {
3436d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
34379566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3438d0609cedSBarry Smith         PetscOptionsEnd();
343965e4b4d4SStefano Zampini       } else {
3440d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
34419566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3442d0609cedSBarry Smith         PetscOptionsEnd();
344365e4b4d4SStefano Zampini       }
344465e4b4d4SStefano Zampini       break;
344565e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
344665e4b4d4SStefano Zampini       if (product->api_user) {
3447d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
34489566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3449d0609cedSBarry Smith         PetscOptionsEnd();
345065e4b4d4SStefano Zampini       } else {
3451d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
34529566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3453d0609cedSBarry Smith         PetscOptionsEnd();
345465e4b4d4SStefano Zampini       }
345565e4b4d4SStefano Zampini       break;
345665e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
345765e4b4d4SStefano Zampini       if (product->api_user) {
3458d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
34599566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3460d0609cedSBarry Smith         PetscOptionsEnd();
346165e4b4d4SStefano Zampini       } else {
3462d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
34639566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3464d0609cedSBarry Smith         PetscOptionsEnd();
346565e4b4d4SStefano Zampini       }
346665e4b4d4SStefano Zampini       break;
346765e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
346865e4b4d4SStefano Zampini       if (product->api_user) {
3469d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
34709566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3471d0609cedSBarry Smith         PetscOptionsEnd();
347265e4b4d4SStefano Zampini       } else {
3473d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
34749566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3475d0609cedSBarry Smith         PetscOptionsEnd();
347665e4b4d4SStefano Zampini       }
347765e4b4d4SStefano Zampini       break;
347865e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
347965e4b4d4SStefano Zampini       if (product->api_user) {
3480d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
34819566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3482d0609cedSBarry Smith         PetscOptionsEnd();
348365e4b4d4SStefano Zampini       } else {
3484d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
34859566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3486d0609cedSBarry Smith         PetscOptionsEnd();
348765e4b4d4SStefano Zampini       }
348865e4b4d4SStefano Zampini       break;
3489d71ae5a4SJacob Faibussowitsch     default:
3490d71ae5a4SJacob Faibussowitsch       break;
349165e4b4d4SStefano Zampini     }
349265e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
349365e4b4d4SStefano Zampini   }
349465e4b4d4SStefano Zampini   /* dispatch */
3495fcdce8c4SStefano Zampini   if (isdense) {
3496ccdfe979SStefano Zampini     switch (product->type) {
3497ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3498ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3499ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3500ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3501ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3502fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
35039566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3504fcdce8c4SStefano Zampini       } else {
3505fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3506fcdce8c4SStefano Zampini       }
3507fcdce8c4SStefano Zampini       break;
3508d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3509d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3510d71ae5a4SJacob Faibussowitsch       break;
3511d71ae5a4SJacob Faibussowitsch     default:
3512d71ae5a4SJacob Faibussowitsch       break;
3513ccdfe979SStefano Zampini     }
3514fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3515fcdce8c4SStefano Zampini     switch (product->type) {
3516fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3517fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3518d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3519d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3520d71ae5a4SJacob Faibussowitsch       break;
3521fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3522fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3523d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3524d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3525d71ae5a4SJacob Faibussowitsch       break;
3526d71ae5a4SJacob Faibussowitsch     default:
3527d71ae5a4SJacob Faibussowitsch       break;
3528fcdce8c4SStefano Zampini     }
3529fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
35309566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3531fcdce8c4SStefano Zampini   }
35323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3533ccdfe979SStefano Zampini }
3534ccdfe979SStefano Zampini 
3535d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3536d71ae5a4SJacob Faibussowitsch {
  /* yy = A*xx: forward to the shared SpMV kernel with no add vector (NULL), no transpose, no conjugation */
35379ae82921SPaul Mullowney   PetscFunctionBegin;
35389566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
35393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3540e6e9a74fSStefano Zampini }
3541e6e9a74fSStefano Zampini 
3542d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3543d71ae5a4SJacob Faibussowitsch {
  /* zz = A*xx + yy: forward to the shared SpMV kernel, no transpose, no conjugation */
3544e6e9a74fSStefano Zampini   PetscFunctionBegin;
35459566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
35463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3547e6e9a74fSStefano Zampini }
3548e6e9a74fSStefano Zampini 
3549d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3550d71ae5a4SJacob Faibussowitsch {
  /* yy = A^H * xx: transpose with conjugation (trans = herm = PETSC_TRUE), no add vector */
3551e6e9a74fSStefano Zampini   PetscFunctionBegin;
35529566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
35533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3554e6e9a74fSStefano Zampini }
3555e6e9a74fSStefano Zampini 
3556d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3557d71ae5a4SJacob Faibussowitsch {
  /* zz = A^H * xx + yy: transpose with conjugation (trans = herm = PETSC_TRUE) */
3558e6e9a74fSStefano Zampini   PetscFunctionBegin;
35599566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
35603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
35619ae82921SPaul Mullowney }
35629ae82921SPaul Mullowney 
3563d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3564d71ae5a4SJacob Faibussowitsch {
  /* yy = A^T * xx: transpose without conjugation (trans = PETSC_TRUE, herm = PETSC_FALSE), no add vector */
3565ca45077fSPaul Mullowney   PetscFunctionBegin;
35669566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
35673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3568ca45077fSPaul Mullowney }
3569ca45077fSPaul Mullowney 
/* y[idx[i]] += x[i] for i in [0, n): scatter-add the compressed work vector into the
   full-length result vector (used by MatMultAddKernel_SeqAIJCUSPARSE when the matrix
   drops zero rows). Assumes the idx entries are distinct (they are compressed-row
   indices at the call site — no atomics are used), and that all pointers are device
   pointers.

   Fixes vs the previous version:
   - the flat index is computed in PetscInt (with the cast applied before the multiply)
     so it cannot overflow 32-bit arithmetic when PetscInt is 64-bit and n is large;
   - a grid-stride loop keeps the kernel correct for any launch configuration, not just
     one that covers n exactly;
   - idx and x are const-qualified (read-only); callers passing non-const pointers are
     unaffected. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x * blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3575a0e72f99SJunchao Zhang 
3576afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3577d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3578d71ae5a4SJacob Faibussowitsch {
  /* Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/MatMultHermitianTranspose(Add)
     for SeqAIJCUSPARSE: zz = op(A) xx (+ yy).
     - yy may be NULL (no add) and may alias zz (in-place add);
     - herm implies trans (checked below);
     - handles "compressed row" storage (zero rows dropped) via a work vector plus a
       scatter/gather against matstruct->cprowIndices. */
35799ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3580aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
35819ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3582e6e9a74fSStefano Zampini   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3583e6e9a74fSStefano Zampini   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3584e6e9a74fSStefano Zampini   PetscBool                     compressed;
3585afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3586afb2bd1cSJunchao Zhang   PetscInt nx, ny;
3587afb2bd1cSJunchao Zhang #endif
35886e111a19SKarl Rupp 
35899ae82921SPaul Mullowney   PetscFunctionBegin;
359008401ef6SPierre Jolivet   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  /* Empty matrix: the result is just yy (or zero); skip all device SpMV work */
3591cbc6b225SStefano Zampini   if (!a->nz) {
3592995bce04SJacob Faibussowitsch     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3593995bce04SJacob Faibussowitsch     else PetscCall(VecSeq_CUDA::Set(zz, 0));
35943ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
3595e6e9a74fSStefano Zampini   }
359634d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
35979566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* Pick the device matrix to apply: A itself (non-transpose, or ^T/^H done by a cusparse
     operation flag), or the stored explicit transpose built on demand */
3598e6e9a74fSStefano Zampini   if (!trans) {
35999ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
36005f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3601e6e9a74fSStefano Zampini   } else {
36021a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3603e6e9a74fSStefano Zampini       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3604e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3605e6e9a74fSStefano Zampini     } else {
36069566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3607e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3608e6e9a74fSStefano Zampini     }
3609e6e9a74fSStefano Zampini   }
3610e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3611e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3612213423ffSJunchao Zhang 
3613e6e9a74fSStefano Zampini   try {
36149566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
361569d47153SPierre Jolivet     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
36169566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3617afb2bd1cSJunchao Zhang 
36189566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3619e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3620afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3621afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3622afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3623afb2bd1cSJunchao Zhang       */
3624e6e9a74fSStefano Zampini       xptr = xarray;
3625afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3626213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3627afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3628afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3629afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3630afb2bd1cSJunchao Zhang        */
3631afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3632afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3633fe5544b9SJunchao Zhang         nx             = mat->num_cols; // since y = Ax
3634afb2bd1cSJunchao Zhang         ny             = mat->num_rows;
3635afb2bd1cSJunchao Zhang       }
3636afb2bd1cSJunchao Zhang #endif
3637e6e9a74fSStefano Zampini     } else {
3638afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3639afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3640afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3641afb2bd1cSJunchao Zhang        */
3642afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3643e6e9a74fSStefano Zampini       dptr = zarray;
3644e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3645afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3646e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3647d0967f54SJacob Faibussowitsch 
3648d0967f54SJacob Faibussowitsch         thrust::for_each(
3649d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC)
3650d0967f54SJacob Faibussowitsch           thrust::cuda::par.on(PetscDefaultCudaStream),
3651d0967f54SJacob Faibussowitsch #endif
3652d0967f54SJacob Faibussowitsch           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
36539371c9d4SSatish Balay           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3654e6e9a74fSStefano Zampini       }
3655afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3656afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3657afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3658fe5544b9SJunchao Zhang         nx             = mat->num_rows; // since y = A^T x
3659afb2bd1cSJunchao Zhang         ny             = mat->num_cols;
3660afb2bd1cSJunchao Zhang       }
3661afb2bd1cSJunchao Zhang #endif
3662e6e9a74fSStefano Zampini     }
36639ae82921SPaul Mullowney 
3664afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3665aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3666afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3667fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3668fe5544b9SJunchao Zhang       cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3669fe5544b9SJunchao Zhang   #else
3670fe5544b9SJunchao Zhang       cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3671fe5544b9SJunchao Zhang   #endif
3672fe5544b9SJunchao Zhang 
36735f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3674fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3675fe5544b9SJunchao Zhang       if (!matDescr) {
3676fe5544b9SJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3677fe5544b9SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3678fe5544b9SJunchao Zhang       }
3679fe5544b9SJunchao Zhang   #endif
3680fe5544b9SJunchao Zhang 
      /* Dense-vector descriptors and the SpMV work buffer are created once per opA and cached;
         subsequent calls only refresh the data pointers below */
3681afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
36829566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
36839566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
36849371c9d4SSatish Balay         PetscCallCUSPARSE(
3685fe5544b9SJunchao Zhang           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
36869566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3687fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3688fe5544b9SJunchao Zhang         PetscCallCUSPARSE(
3689fe5544b9SJunchao Zhang           cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3690fe5544b9SJunchao Zhang   #endif
3691afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3692afb2bd1cSJunchao Zhang       } else {
3693afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
36949566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
36959566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3696afb2bd1cSJunchao Zhang       }
3697afb2bd1cSJunchao Zhang 
3698fe5544b9SJunchao Zhang       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3699afb2bd1cSJunchao Zhang #else
37007656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
37019371c9d4SSatish Balay       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3702afb2bd1cSJunchao Zhang #endif
3703aa372e3fSPaul Mullowney     } else {
3704213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3705afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3706afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3707afb2bd1cSJunchao Zhang #else
3708301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
37099371c9d4SSatish Balay         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3710afb2bd1cSJunchao Zhang #endif
3711a65300a6SPaul Mullowney       }
3712aa372e3fSPaul Mullowney     }
37139566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3714aa372e3fSPaul Mullowney 
3715e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3716213423ffSJunchao Zhang       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3717213423ffSJunchao Zhang         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3718995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3719e6e9a74fSStefano Zampini         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3720995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
37217656d835SStefano Zampini         }
3722213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3723995bce04SJacob Faibussowitsch         PetscCall(VecSeq_CUDA::Set(zz, 0));
37247656d835SStefano Zampini       }
37257656d835SStefano Zampini 
3726213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3727213423ffSJunchao Zhang       if (compressed) {
37289566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
3729da81f932SPierre Jolivet         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3730a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3731a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
3732a0e72f99SJunchao Zhang          */
3733a0e72f99SJunchao Zhang #if 0
3734a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3735a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3736a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3737e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3738c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
3739a0e72f99SJunchao Zhang #else
37406497c311SBarry Smith         PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        /* NOTE(review): the block count is cast to int; assumes (n + 255) / 256 fits in int — confirm for very large 64-bit n */
37416497c311SBarry Smith         ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3742a0e72f99SJunchao Zhang #endif
37439566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3744e6e9a74fSStefano Zampini       }
3745e6e9a74fSStefano Zampini     } else {
3746995bce04SJacob Faibussowitsch       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3747e6e9a74fSStefano Zampini     }
37489566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
37499566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
37509566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3751d71ae5a4SJacob Faibussowitsch   } catch (char *ex) {
3752d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3753d71ae5a4SJacob Faibussowitsch   }
  /* Flop count: 2 flops per nonzero; without an add vector the first write per nonzero row is not an add */
3754e6e9a74fSStefano Zampini   if (yy) {
37559566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3756e6e9a74fSStefano Zampini   } else {
37579566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3758e6e9a74fSStefano Zampini   }
37593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37609ae82921SPaul Mullowney }
37619ae82921SPaul Mullowney 
3762d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3763d71ae5a4SJacob Faibussowitsch {
  /* zz = A^T * xx + yy: transpose without conjugation (trans = PETSC_TRUE, herm = PETSC_FALSE) */
3764ca45077fSPaul Mullowney   PetscFunctionBegin;
37659566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
37663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3767ca45077fSPaul Mullowney }
3768ca45077fSPaul Mullowney 
3769d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3770d71ae5a4SJacob Faibussowitsch {
  /* Finish assembly on the host only; the device mirror is refreshed lazily at use sites
     (see the MatSeqAIJCUSPARSECopyToGPU() calls in the mult/axpy paths) */
3771042217e8SBarry Smith   PetscFunctionBegin;
37729566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
37733ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37749ae82921SPaul Mullowney }
37759ae82921SPaul Mullowney 
3776e057df02SPaul Mullowney /*@
377711a5261eSBarry Smith   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
37782920cce0SJacob Faibussowitsch   (the default parallel PETSc format).
37799ae82921SPaul Mullowney 
3780d083f849SBarry Smith   Collective
37819ae82921SPaul Mullowney 
37829ae82921SPaul Mullowney   Input Parameters:
378311a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
37849ae82921SPaul Mullowney . m    - number of rows
37859ae82921SPaul Mullowney . n    - number of columns
378620f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
378720f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
37889ae82921SPaul Mullowney 
37899ae82921SPaul Mullowney   Output Parameter:
37909ae82921SPaul Mullowney . A - the matrix
37919ae82921SPaul Mullowney 
37922ef1f0ffSBarry Smith   Level: intermediate
37932ef1f0ffSBarry Smith 
37942ef1f0ffSBarry Smith   Notes:
37952920cce0SJacob Faibussowitsch   This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
37962920cce0SJacob Faibussowitsch   calculations. For good matrix assembly performance the user should preallocate the matrix
37972920cce0SJacob Faibussowitsch   storage by setting the parameter `nz` (or the array `nnz`).
37982920cce0SJacob Faibussowitsch 
379911a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
38009ae82921SPaul Mullowney   MatXXXXSetPreallocation() paradigm instead of this routine directly.
380111a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
38029ae82921SPaul Mullowney 
380311a5261eSBarry Smith   The AIJ format, also called
38042ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
38059ae82921SPaul Mullowney   storage.  That is, the stored row and column indices can begin at
380620f4b53cSBarry Smith   either one (as in Fortran) or zero.
38079ae82921SPaul Mullowney 
38089ae82921SPaul Mullowney   Specify the preallocated storage with either nz or nnz (not both).
38092ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
381020f4b53cSBarry Smith   allocation.
38119ae82921SPaul Mullowney 
3812fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
38139ae82921SPaul Mullowney @*/
3814d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3815d71ae5a4SJacob Faibussowitsch {
38169ae82921SPaul Mullowney   PetscFunctionBegin;
38179566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
  /* sequential matrix: the same m, n are passed as both local and global sizes */
38189566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
38199566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* const is cast away to match the internal preallocation routine's signature — presumably nnz is only read; confirm */
38209566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
38213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
38229ae82921SPaul Mullowney }
38239ae82921SPaul Mullowney 
3824d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3825d71ae5a4SJacob Faibussowitsch {
  /* Release the GPU-side data (spptr) — factored and unfactored matrices store different
     structures there — then clear all composed function pointers before handing off to the
     base SeqAIJ destructor */
38269ae82921SPaul Mullowney   PetscFunctionBegin;
38279ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
38282c4ab24aSJunchao Zhang     PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
38299ae82921SPaul Mullowney   } else {
38309566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3831aa372e3fSPaul Mullowney   }
38329566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
38339566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
38349566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
38359566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
38369566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
38379566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
38389566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
38399566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
38409566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
38419566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
38429566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
38433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
38449ae82921SPaul Mullowney }
38459ae82921SPaul Mullowney 
3846ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
384795639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3848d71ae5a4SJacob Faibussowitsch static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3849d71ae5a4SJacob Faibussowitsch {
  /* Duplicate as a plain SeqAIJ matrix first, then convert the copy in place back to
     the CUSPARSE type */
38509ff858a8SKarl Rupp   PetscFunctionBegin;
38519566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
38529566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
38533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
38549ff858a8SKarl Rupp }
38559ff858a8SKarl Rupp 
3856d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3857d71ae5a4SJacob Faibussowitsch {
3858a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3859039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3860039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3861039c6fbaSStefano Zampini   PetscScalar        *ay;
3862039c6fbaSStefano Zampini   const PetscScalar  *ax;
3863039c6fbaSStefano Zampini   CsrMatrix          *csry, *csrx;
3864e6e9a74fSStefano Zampini 
386595639643SRichard Tran Mills   PetscFunctionBegin;
3866a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3867a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3868039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
38699566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
38709566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
38713ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
387295639643SRichard Tran Mills   }
3873039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
38749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
38759566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
38765f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
38775f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3878039c6fbaSStefano Zampini   csry = (CsrMatrix *)cy->mat->mat;
3879039c6fbaSStefano Zampini   csrx = (CsrMatrix *)cx->mat->mat;
3880039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3881039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3882039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3883ad540459SPierre Jolivet     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3884039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3885039c6fbaSStefano Zampini   }
3886d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3887d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3888039c6fbaSStefano Zampini 
3889039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3890039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3891039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3892039c6fbaSStefano Zampini     size_t bufferSize;
3893039c6fbaSStefano Zampini     void  *buffer;
3894039c6fbaSStefano Zampini #endif
3895039c6fbaSStefano Zampini 
38969566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
38979566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
38989566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3899039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
39009371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
39019371c9d4SSatish Balay                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
39029566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
39039566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39049371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
39059371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
39069566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
39079566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
39089566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3909039c6fbaSStefano Zampini #else
39109566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39119371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
39129371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
39139566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
39149566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3915039c6fbaSStefano Zampini #endif
39169566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
39179566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
39189566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39199566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3920039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3921a587d139SMark     cublasHandle_t cublasv2handle;
3922a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3923039c6fbaSStefano Zampini 
39249566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
39259566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
39269566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
39279566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz, &bnz));
39289566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39299566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
39309566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * bnz));
39319566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
39329566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
39339566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39349566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3935039c6fbaSStefano Zampini   } else {
39369566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
39379566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3938a587d139SMark   }
39393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
394095639643SRichard Tran Mills }
394195639643SRichard Tran Mills 
3942d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3943d71ae5a4SJacob Faibussowitsch {
394433c9ba73SStefano Zampini   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
394533c9ba73SStefano Zampini   PetscScalar   *ay;
394633c9ba73SStefano Zampini   cublasHandle_t cublasv2handle;
394733c9ba73SStefano Zampini   PetscBLASInt   one = 1, bnz = 1;
394833c9ba73SStefano Zampini 
394933c9ba73SStefano Zampini   PetscFunctionBegin;
39509566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
39519566063dSJacob Faibussowitsch   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
39529566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(y->nz, &bnz));
39539566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
39549566063dSJacob Faibussowitsch   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
39559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(bnz));
39569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
39579566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
39593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
396033c9ba73SStefano Zampini }
396133c9ba73SStefano Zampini 
3962d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3963d71ae5a4SJacob Faibussowitsch {
39647e8381f9SStefano Zampini   PetscBool   both = PETSC_FALSE;
3965a587d139SMark   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;
39667e8381f9SStefano Zampini 
39673fa6b06aSMark Adams   PetscFunctionBegin;
39683fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
39693fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
39707e8381f9SStefano Zampini     if (spptr->mat) {
39717e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
39727e8381f9SStefano Zampini       if (matrix->values) {
39737e8381f9SStefano Zampini         both = PETSC_TRUE;
39747e8381f9SStefano Zampini         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
39757e8381f9SStefano Zampini       }
39767e8381f9SStefano Zampini     }
39777e8381f9SStefano Zampini     if (spptr->matTranspose) {
39787e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3979ad540459SPierre Jolivet       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
39807e8381f9SStefano Zampini     }
39813fa6b06aSMark Adams   }
39829566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
39839566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
39847e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3985a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
39863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
39873fa6b06aSMark Adams }
39883fa6b06aSMark Adams 
3989d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3990d71ae5a4SJacob Faibussowitsch {
3991a587d139SMark   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3992a587d139SMark 
3993a587d139SMark   PetscFunctionBegin;
39949a14fc28SStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) {
39959a14fc28SStefano Zampini     A->boundtocpu = flg;
39963ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
39979a14fc28SStefano Zampini   }
3998a587d139SMark   if (flg) {
39999566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
4000a587d139SMark 
400133c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
4002a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
4003a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
4004a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
4005a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
4006a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
4007a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
4008a587d139SMark     A->ops->multhermitiantranspose    = NULL;
4009a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
4010fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
40119566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
40129566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
40139566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
40149566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
40159566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
40169566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
40179566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
4018a587d139SMark   } else {
401933c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
4020a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
4021a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
4022a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
4023a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
4024a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
4025a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
4026a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
4027a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
4028fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
402967a45760SJunchao Zhang     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
403067a45760SJunchao Zhang     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
403167a45760SJunchao Zhang     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
403267a45760SJunchao Zhang     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
403367a45760SJunchao Zhang     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
403467a45760SJunchao Zhang     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
40357ee59b9bSJunchao Zhang     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
40367ee59b9bSJunchao Zhang 
40379566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
40389566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
40399566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
40409566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
40419566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
40429566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
4043a587d139SMark   }
4044a587d139SMark   A->boundtocpu = flg;
4045ea500dcfSRichard Tran Mills   if (flg && a->inode.size) {
4046ea500dcfSRichard Tran Mills     a->inode.use = PETSC_TRUE;
4047ea500dcfSRichard Tran Mills   } else {
4048ea500dcfSRichard Tran Mills     a->inode.use = PETSC_FALSE;
4049ea500dcfSRichard Tran Mills   }
40503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4051a587d139SMark }
4052a587d139SMark 
40538eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
4054d71ae5a4SJacob Faibussowitsch {
405549735bf3SStefano Zampini   Mat B;
40569ae82921SPaul Mullowney 
40579ae82921SPaul Mullowney   PetscFunctionBegin;
40589566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
405949735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
40609566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
406149735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
40629566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
406349735bf3SStefano Zampini   }
406449735bf3SStefano Zampini   B = *newmat;
406549735bf3SStefano Zampini 
40669566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
40679566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
406834136279SStefano Zampini 
406949735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
40709ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
4071e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
40729566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
40739566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
40749566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
40751a2c6b5cSJunchao Zhang       spptr->format = MAT_CUSPARSE_CSR;
4076d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4077b917901dSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4078a435da06SStefano Zampini       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4079a435da06SStefano Zampini   #else
4080d8132acaSStefano Zampini       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4081a435da06SStefano Zampini   #endif
4082d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4083d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4084d8132acaSStefano Zampini #endif
40851a2c6b5cSJunchao Zhang       B->spptr = spptr;
40869ae82921SPaul Mullowney     } else {
4087e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
4088e6e9a74fSStefano Zampini 
40899566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
40909566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
40919566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4092e6e9a74fSStefano Zampini       B->spptr = spptr;
40939ae82921SPaul Mullowney     }
4094e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
409549735bf3SStefano Zampini   }
4096693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
40979ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
40981a2c6b5cSJunchao Zhang   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
40999ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
410095639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
4101693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
41022205254eSKarl Rupp 
41039566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
41049566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
41059566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4106ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
41079566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4108ae48a8d0SStefano Zampini #endif
41099566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
41103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41119ae82921SPaul Mullowney }
41129ae82921SPaul Mullowney 
4113d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4114d71ae5a4SJacob Faibussowitsch {
411502fe1965SBarry Smith   PetscFunctionBegin;
41169566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
41179566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
41183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
411902fe1965SBarry Smith }
412002fe1965SBarry Smith 
41213ca39a21SBarry Smith /*MC
4122e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4123e057df02SPaul Mullowney 
412415229ffcSPierre Jolivet    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
412511a5261eSBarry Smith    CSR, ELL, or Hybrid format.
412611a5261eSBarry Smith    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
4127e057df02SPaul Mullowney 
4128e057df02SPaul Mullowney    Options Database Keys:
412911a5261eSBarry Smith +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
41302ef1f0ffSBarry Smith .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
41312ef1f0ffSBarry Smith                                       Other options include ell (ellpack) or hyb (hybrid).
41322ef1f0ffSBarry Smith .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
41332ef1f0ffSBarry Smith -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4134e057df02SPaul Mullowney 
4135e057df02SPaul Mullowney   Level: beginner
4136e057df02SPaul Mullowney 
41371cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4138e057df02SPaul Mullowney M*/
41397f756511SDominic Meiser 
4140d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4141d71ae5a4SJacob Faibussowitsch {
414242c9c57cSBarry Smith   PetscFunctionBegin;
41439566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
41449566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
41459566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
41469566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
41473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
414842c9c57cSBarry Smith }
414929b38603SBarry Smith 
41502c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
4151d71ae5a4SJacob Faibussowitsch {
41522c4ab24aSJunchao Zhang   Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
4153cbc6b225SStefano Zampini 
4154cbc6b225SStefano Zampini   PetscFunctionBegin;
41552c4ab24aSJunchao Zhang   if (cusp) {
41562c4ab24aSJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
41572c4ab24aSJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
41582c4ab24aSJunchao Zhang     delete cusp->workVector;
41592c4ab24aSJunchao Zhang     delete cusp->rowoffsets_gpu;
41602c4ab24aSJunchao Zhang     delete cusp->csr2csc_i;
41612c4ab24aSJunchao Zhang     delete cusp->coords;
41622c4ab24aSJunchao Zhang     if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
41632c4ab24aSJunchao Zhang     PetscCall(PetscFree(mat->spptr));
41647f756511SDominic Meiser   }
41653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41667f756511SDominic Meiser }
41677f756511SDominic Meiser 
4168d71ae5a4SJacob Faibussowitsch static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4169d71ae5a4SJacob Faibussowitsch {
41707f756511SDominic Meiser   PetscFunctionBegin;
41717f756511SDominic Meiser   if (*mat) {
41727f756511SDominic Meiser     delete (*mat)->values;
41737f756511SDominic Meiser     delete (*mat)->column_indices;
41747f756511SDominic Meiser     delete (*mat)->row_offsets;
41757f756511SDominic Meiser     delete *mat;
41767f756511SDominic Meiser     *mat = 0;
41777f756511SDominic Meiser   }
41783ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41797f756511SDominic Meiser }
41807f756511SDominic Meiser 
4181b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4182d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4183d71ae5a4SJacob Faibussowitsch {
41847f756511SDominic Meiser   PetscFunctionBegin;
41857f756511SDominic Meiser   if (*trifactor) {
41869566063dSJacob Faibussowitsch     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4187261a78b4SJunchao Zhang     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
41889566063dSJacob Faibussowitsch     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
41899566063dSJacob Faibussowitsch     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
41909566063dSJacob Faibussowitsch     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4191afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
41929566063dSJacob Faibussowitsch     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4193afb2bd1cSJunchao Zhang   #endif
41949566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactor));
41957f756511SDominic Meiser   }
41963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41977f756511SDominic Meiser }
4198d460d7bfSJunchao Zhang #endif
41997f756511SDominic Meiser 
4200d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4201d71ae5a4SJacob Faibussowitsch {
42027f756511SDominic Meiser   CsrMatrix *mat;
42037f756511SDominic Meiser 
42047f756511SDominic Meiser   PetscFunctionBegin;
42057f756511SDominic Meiser   if (*matstruct) {
42067f756511SDominic Meiser     if ((*matstruct)->mat) {
42077f756511SDominic Meiser       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4208afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4209afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4210afb2bd1cSJunchao Zhang #else
42117f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
42129566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4213afb2bd1cSJunchao Zhang #endif
42147f756511SDominic Meiser       } else {
42157f756511SDominic Meiser         mat = (CsrMatrix *)(*matstruct)->mat;
42163ba16761SJacob Faibussowitsch         PetscCall(CsrMatrix_Destroy(&mat));
42177f756511SDominic Meiser       }
42187f756511SDominic Meiser     }
42199566063dSJacob Faibussowitsch     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
42207f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
42219566063dSJacob Faibussowitsch     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
42229566063dSJacob Faibussowitsch     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
42239566063dSJacob Faibussowitsch     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4224afb2bd1cSJunchao Zhang 
4225afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4226afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
42279566063dSJacob Faibussowitsch     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
4228fe5544b9SJunchao Zhang 
4229afb2bd1cSJunchao Zhang     for (int i = 0; i < 3; i++) {
4230afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
42319566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
42329566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
42339566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4234fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
4235fe5544b9SJunchao Zhang         if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i]));
4236fe5544b9SJunchao Zhang         if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i]));
4237fe5544b9SJunchao Zhang   #endif
4238afb2bd1cSJunchao Zhang       }
4239afb2bd1cSJunchao Zhang     }
4240afb2bd1cSJunchao Zhang #endif
42417f756511SDominic Meiser     delete *matstruct;
42427e8381f9SStefano Zampini     *matstruct = NULL;
42437f756511SDominic Meiser   }
42443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42457f756511SDominic Meiser }
42467f756511SDominic Meiser 
4247d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4248d71ae5a4SJacob Faibussowitsch {
4249da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4250da112707SJunchao Zhang 
42517f756511SDominic Meiser   PetscFunctionBegin;
4252da112707SJunchao Zhang   if (fs) {
4253b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4254da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4255da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4256da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4257da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4258d460d7bfSJunchao Zhang     delete fs->workVector;
4259d460d7bfSJunchao Zhang     fs->workVector = NULL;
4260d460d7bfSJunchao Zhang #endif
4261da112707SJunchao Zhang     delete fs->rpermIndices;
4262da112707SJunchao Zhang     delete fs->cpermIndices;
4263da112707SJunchao Zhang     fs->rpermIndices  = NULL;
4264da112707SJunchao Zhang     fs->cpermIndices  = NULL;
4265da112707SJunchao Zhang     fs->init_dev_prop = PETSC_FALSE;
4266b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4267da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4268da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrColIdx));
426930807b38SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
427030807b38SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4271da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrVal));
4272d460d7bfSJunchao Zhang     PetscCallCUDA(cudaFree(fs->diag));
4273da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->X));
4274da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->Y));
427512ba2bc6SJunchao Zhang     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
4276da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4277da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
427812ba2bc6SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4279da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4280da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4281da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4282da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4283da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4284da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4285da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4286da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4287da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4288da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4289da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4290da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4291d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->csrRowPtr_h));
4292d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->csrVal_h));
4293d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->diag_h));
429412ba2bc6SJunchao Zhang     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
429512ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4296da112707SJunchao Zhang #endif
4297ccdfe979SStefano Zampini   }
42983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4299ccdfe979SStefano Zampini }
4300ccdfe979SStefano Zampini 
4301d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4302d71ae5a4SJacob Faibussowitsch {
4303ccdfe979SStefano Zampini   PetscFunctionBegin;
4304ccdfe979SStefano Zampini   if (*trifactors) {
43059566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4306f0173cd6SStefano Zampini     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
43079566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactors));
43087f756511SDominic Meiser   }
43093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
43107f756511SDominic Meiser }
43117e8381f9SStefano Zampini 
/* Strict-weak ordering for (row, col) index pairs: primary key is the row,
   secondary key is the column. Used to sort COO entries into CSR order. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b)
  {
    const PetscInt arow = thrust::get<0>(a), brow = thrust::get<0>(b);
    if (arow != brow) return arow < brow;
    return thrust::get<1>(a) < thrust::get<1>(b);
  }
};
43207e8381f9SStefano Zampini 
/* Mark the cached transpose of A as out of date so it is rebuilt before next use;
   when destroy is PETSC_TRUE also free the cached transpose multiply structure
   and the cached csr2csc index array. */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS); /* no device-side data yet, so nothing is cached */
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i; /* cached index array used for the csr->csc conversion */
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE; /* forces regeneration the next time the transpose is needed */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4336a49f1ed0SStefano Zampini 
/* PetscContainer destructor for the device-side MatCOOStruct_SeqAIJ mirror:
   frees the GPU copies of jmap and perm, then the struct itself. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  coo = (MatCOOStruct_SeqAIJ *)*data;
  /* jmap and perm are device allocations created in MatSetPreallocationCOO_SeqAIJCUSPARSE */
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4347ed502f03SStefano Zampini 
/* Set up COO assembly for a MATSEQAIJCUSPARSE matrix.

   coo_i[]/coo_j[] may live in host or device memory (detected from coo_i);
   device input is staged through a temporary host copy because the host-side
   preallocation routine needs host pointers. After the host COO struct is
   built and attached as "__PETSc_MatCOOStruct_Host", a device mirror with
   jmap/perm in GPU memory is attached as "__PETSc_MatCOOStruct_Device". */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) { /* assumes coo_j resides in the same memory space as coo_i */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j)); /* builds the host COO struct and the CSR pattern */
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; /* preallocation modified host-side data only */
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4388219fbbafSJunchao Zhang 
/* Grid-stride kernel: for each nonzero i of the assembled matrix, sum the COO
   input values kv[] that map onto it (indices perm[jmap[i]..jmap[i+1])) and
   combine the sum into the CSR value array a[]. With INSERT_VALUES the old
   a[i] is discarded; otherwise (ADD_VALUES) the sum is accumulated into it.
   The grid-stride loop makes the kernel correct for any launch configuration. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4399219fbbafSJunchao Zhang 
/* MatSetValuesCOO implementation for MATSEQAIJCUSPARSE: scatter/accumulate the
   COO values v[] into the device CSR value array using the jmap/perm maps built
   by MatSetPreallocationCOO_SeqAIJCUSPARSE(). v may be a host or device pointer;
   host input is staged through a temporary device buffer of coo->n entries. */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write-only: no host->device copy */
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));                             /* read-write: existing values are needed */

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    /* Divide before narrowing to int: the old code cast (Annz + 255) to int first,
       which could overflow for very large nnz. Clamp to the CUDA grid x-dimension
       limit (2^31-1); the kernel's grid-stride loop still covers all entries. */
    const int nblocks = (int)PetscMin((Annz + 255) / 256, (PetscCount)2147483647);
    MatAddCOOValues<<<nblocks, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing sticky state */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* release the staging buffer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4439219fbbafSJunchao Zhang 
44405b7e41feSStefano Zampini /*@C
44412ef1f0ffSBarry Smith   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
44425b7e41feSStefano Zampini 
44432ef1f0ffSBarry Smith   Not Collective
44445b7e41feSStefano Zampini 
44455b7e41feSStefano Zampini   Input Parameters:
44465b7e41feSStefano Zampini + A          - the matrix
444711a5261eSBarry Smith - compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
44485b7e41feSStefano Zampini 
44495b7e41feSStefano Zampini   Output Parameters:
445020f4b53cSBarry Smith + i - the CSR row pointers
445120f4b53cSBarry Smith - j - the CSR column indices
44525b7e41feSStefano Zampini 
44535b7e41feSStefano Zampini   Level: developer
44545b7e41feSStefano Zampini 
445511a5261eSBarry Smith   Note:
44565b7e41feSStefano Zampini   When compressed is true, the CSR structure does not contain empty rows
44575b7e41feSStefano Zampini 
44581cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
44595b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* NOTE(review): both i and j must be requested; passing NULL for either makes
     the call a no-op and neither pointer is returned */
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure device CSR data is current before exposing pointers */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* lazily build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get(); /* stored (possibly compressed) row offsets */
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
44875f101d05SStefano Zampini 
44885b7e41feSStefano Zampini /*@C
44892ef1f0ffSBarry Smith   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
44905b7e41feSStefano Zampini 
44912ef1f0ffSBarry Smith   Not Collective
44925b7e41feSStefano Zampini 
44935b7e41feSStefano Zampini   Input Parameters:
44945b7e41feSStefano Zampini + A          - the matrix
44952ef1f0ffSBarry Smith . compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
449620f4b53cSBarry Smith . i          - the CSR row pointers
449720f4b53cSBarry Smith - j          - the CSR column indices
44985b7e41feSStefano Zampini 
44995b7e41feSStefano Zampini   Level: developer
45005b7e41feSStefano Zampini 
45011cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
45025b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: restoring requires no knowledge of the compression mode */
  /* invalidate the borrowed device pointers; no matrix state changes here */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
45135f101d05SStefano Zampini 
45145b7e41feSStefano Zampini /*@C
451511a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45165b7e41feSStefano Zampini 
45175b7e41feSStefano Zampini   Not Collective
45185b7e41feSStefano Zampini 
45195b7e41feSStefano Zampini   Input Parameter:
452011a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
45215b7e41feSStefano Zampini 
45225b7e41feSStefano Zampini   Output Parameter:
45235b7e41feSStefano Zampini . a - pointer to the device data
45245b7e41feSStefano Zampini 
45255b7e41feSStefano Zampini   Level: developer
45265b7e41feSStefano Zampini 
452711a5261eSBarry Smith   Note:
452811a5261eSBarry Smith   May trigger host-device copies if up-to-date matrix data is on host
45295b7e41feSStefano Zampini 
45301cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
45315b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may copy up-to-date host values to the device */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* raw device pointer into the CSR values; read-only access leaves the offload mask unchanged */
  *a = csr->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4549ed502f03SStefano Zampini 
45505b7e41feSStefano Zampini /*@C
455111a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
45525b7e41feSStefano Zampini 
45535b7e41feSStefano Zampini   Not Collective
45545b7e41feSStefano Zampini 
45552ef1f0ffSBarry Smith   Input Parameters:
45562ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
45572ef1f0ffSBarry Smith - a - pointer to the device data
45585b7e41feSStefano Zampini 
45595b7e41feSStefano Zampini   Level: developer
45605b7e41feSStefano Zampini 
45611cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
45625b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read access changes no matrix state; just invalidate the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4572ed502f03SStefano Zampini 
45735b7e41feSStefano Zampini /*@C
457411a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45755b7e41feSStefano Zampini 
45765b7e41feSStefano Zampini   Not Collective
45775b7e41feSStefano Zampini 
45785b7e41feSStefano Zampini   Input Parameter:
457911a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
45805b7e41feSStefano Zampini 
45815b7e41feSStefano Zampini   Output Parameter:
45825b7e41feSStefano Zampini . a - pointer to the device data
45835b7e41feSStefano Zampini 
45845b7e41feSStefano Zampini   Level: developer
45855b7e41feSStefano Zampini 
458611a5261eSBarry Smith   Note:
458711a5261eSBarry Smith   May trigger host-device copies if up-to-date matrix data is on host
45885b7e41feSStefano Zampini 
45891cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
45905b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access: existing values must be current on the device */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may modify values, so the GPU copy becomes authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose values will be stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
46105b7e41feSStefano Zampini /*@C
461111a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4612039c6fbaSStefano Zampini 
46135b7e41feSStefano Zampini   Not Collective
46145b7e41feSStefano Zampini 
46152ef1f0ffSBarry Smith   Input Parameters:
46162ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
46172ef1f0ffSBarry Smith - a - pointer to the device data
46185b7e41feSStefano Zampini 
46195b7e41feSStefano Zampini   Level: developer
46205b7e41feSStefano Zampini 
46211cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
46225b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* the borrowed device pointer must not be used after restore */
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* values may have changed under read-write access */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4634039c6fbaSStefano Zampini 
46355b7e41feSStefano Zampini /*@C
463611a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
46375b7e41feSStefano Zampini 
46385b7e41feSStefano Zampini   Not Collective
46395b7e41feSStefano Zampini 
46405b7e41feSStefano Zampini   Input Parameter:
464111a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
46425b7e41feSStefano Zampini 
46435b7e41feSStefano Zampini   Output Parameter:
46445b7e41feSStefano Zampini . a - pointer to the device data
46455b7e41feSStefano Zampini 
46465b7e41feSStefano Zampini   Level: developer
46475b7e41feSStefano Zampini 
464811a5261eSBarry Smith   Note:
464911a5261eSBarry Smith   Does not trigger host-device copies and flags data validity on the GPU
46505b7e41feSStefano Zampini 
46511cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
46525b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no MatSeqAIJCUSPARSECopyToGPU()
     here -- existing values are not needed, so the device structure must already exist */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller will overwrite values on the device */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose values will be stale */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4671ed502f03SStefano Zampini 
46725b7e41feSStefano Zampini /*@C
467311a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
46745b7e41feSStefano Zampini 
46755b7e41feSStefano Zampini   Not Collective
46765b7e41feSStefano Zampini 
46772ef1f0ffSBarry Smith   Input Parameters:
46782ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
46792ef1f0ffSBarry Smith - a - pointer to the device data
46805b7e41feSStefano Zampini 
46815b7e41feSStefano Zampini   Level: developer
46825b7e41feSStefano Zampini 
46831cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
46845b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* the borrowed device pointer must not be used after restore */
  PetscCall(MatSeqAIJInvalidateDiagonal(A)); /* values were (re)written */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4696ed502f03SStefano Zampini 
/* Strict-weak ordering for (row, col, value, index) 4-tuples: compares by row,
   then by column; the value and index components do not participate. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &a, const thrust::tuple<int, int, PetscScalar, int> &b)
  {
    const int arow = thrust::get<0>(a), brow = thrust::get<0>(b);
    if (arow != brow) return arow < brow;
    return thrust::get<1>(a) < thrust::get<1>(b);
  }
};
4705ed502f03SStefano Zampini 
/* Unary functor adding a fixed offset to an int index (e.g. to shift column
   indices when concatenating matrices). */
struct Shift {
  int _shift; // offset applied by operator()

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4712ed502f03SStefano Zampini 
471321afe8ebSBarry Smith /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4714d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4715d71ae5a4SJacob Faibussowitsch {
4716ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4717ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4718ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4719ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4720ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4721ed502f03SStefano Zampini   cusparseStatus_t              stat;
4722ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4723ed502f03SStefano Zampini 
4724ed502f03SStefano Zampini   PetscFunctionBegin;
4725ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4726ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
47274f572ea9SToby Isaac   PetscAssertPointer(C, 4);
4728ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4729ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
47305f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
473108401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4732aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4733aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4734ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4735ed502f03SStefano Zampini     m = A->rmap->n;
4736ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
47379566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
47389566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
47399566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4740ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4741ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4742ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4743ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4744ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4745ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4746ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4747ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4748ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4749ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4750ed502f03SStefano Zampini     Ccusp->nrows            = m;
4751ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4752ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4753ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4754ed502f03SStefano Zampini     Ccsr->num_cols          = n;
47559566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
47569566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
47579566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4758f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4759f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4760f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
47619566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47629566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47639566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47649566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
47659566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
476628b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
476728b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4768ed502f03SStefano Zampini 
4769ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4770ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4771ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4772ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4773ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4774ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4775ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4776ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4777ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
47782c4ab24aSJunchao Zhang     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4779ed502f03SStefano Zampini     if (c->nz) {
47802ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
47812ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
47822ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
47832ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
47842ed87e7eSStefano Zampini 
4785ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4786ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4787ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4788ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
47899566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4790ed502f03SStefano Zampini         }
47912ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
47922ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4793ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4794ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4795ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4796ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
47979566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4798ed502f03SStefano Zampini         }
47992ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
48002ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
48019566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
48029371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48039371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48049371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48059371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48062ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
48072ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
48082ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
48098909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4810ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4811ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
48128909a122SStefano Zampini #else
48138909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
48148909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
48158909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
48168909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
48178909a122SStefano Zampini #endif
48182ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
48192ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
48202ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
48212ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
48222ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
48232ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
48242c4ab24aSJunchao Zhang       auto p1    = Ccusp->coords->begin();
48252c4ab24aSJunchao Zhang       auto p2    = Ccusp->coords->begin();
4826ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4827792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
48288909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
48298909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
48308909a122SStefano Zampini #endif
48312ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
48322ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
48332ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4834792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
48352ed87e7eSStefano Zampini #else
48362ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4837792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4838792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
48392ed87e7eSStefano Zampini #endif
48409371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48419371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48429566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
48432ed87e7eSStefano Zampini       delete wPerm;
48442ed87e7eSStefano Zampini       delete Acoo;
48452ed87e7eSStefano Zampini       delete Bcoo;
48462ed87e7eSStefano Zampini       delete Ccoo;
4847ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48489371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48499371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4850ed502f03SStefano Zampini #endif
48511a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
48529566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
48539566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4854ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4855ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4856ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4857ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4858ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4859ed502f03SStefano Zampini 
48601a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
48611a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4862a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4863ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4864ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4865ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4866ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4867ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4868ed502f03SStefano Zampini 
4869ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4870ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4871ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4872ed502f03SStefano Zampini 
48739566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4874ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4875ed502f03SStefano Zampini         if (AT) {
4876ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4877ed502f03SStefano Zampini           thrust::advance(rT, -1);
4878ed502f03SStefano Zampini         }
4879ed502f03SStefano Zampini         if (BT) {
4880ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4881ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4882ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4883ed502f03SStefano Zampini         }
4884ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4885ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4886ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4887ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4888ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4889ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48909566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4891ed502f03SStefano Zampini 
48929566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
48939566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
48949566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4895f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4896f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4897f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
48989566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48999566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
49009566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4901ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
49029371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
49039371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4904ed502f03SStefano Zampini #endif
4905ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4906ed502f03SStefano Zampini       }
4907ed502f03SStefano Zampini     }
4908ed502f03SStefano Zampini 
4909ed502f03SStefano Zampini     c->free_a = PETSC_TRUE;
49109f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
49119f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4912ed502f03SStefano Zampini     c->free_ij = PETSC_TRUE;
49137de69702SBarry Smith     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4914ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4915ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4916ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4917ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
49189566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49199566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4920ed502f03SStefano Zampini     } else {
49219566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49229566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4923ed502f03SStefano Zampini     }
49249566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
49259566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
49269566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4927ed502f03SStefano Zampini     c->maxnz         = c->nz;
4928ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4929ed502f03SStefano Zampini     c->rmax          = 0;
4930ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4931ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4932ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4933ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4934ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4935ed502f03SStefano Zampini     }
49369566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
49379566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4938ed502f03SStefano Zampini     (*C)->nonzerostate++;
49399566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
49409566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4941ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4942ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4943ed502f03SStefano Zampini   } else {
494408401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4945ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4946ed502f03SStefano Zampini     if (c->nz) {
4947ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
49482c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4949aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
495008401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
49519566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
49529566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
49535f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
49545f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4955ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4956ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4957ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4958aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4959aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4960aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4961aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
49622c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
49632c4ab24aSJunchao Zhang       auto pmid = Ccusp->coords->begin();
4964ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
49659566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
49662c4ab24aSJunchao Zhang       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
49679371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4968ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
49699371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
49702c4ab24aSJunchao Zhang       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4971ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
49729566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
49731a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
49745f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4975ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4976ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4977ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4978ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4979ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4980ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4981ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
49821a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4983ed502f03SStefano Zampini       }
49849566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4985ed502f03SStefano Zampini     }
4986ed502f03SStefano Zampini   }
49879566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4988ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4989ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4990ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
49913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4992ed502f03SStefano Zampini }
4993c215019aSStefano Zampini 
/*
  Gathers matrix values into a caller-supplied array on the GPU.

  Copies the matrix entries A->a[idx[0..n-1]] (or the first n entries of A->a when
  idx is NULL) into v. v may live in either host or device memory; isCudaMem() is
  used to detect which, and the result is staged through a temporary device buffer
  only when v is host memory and a gather (idx != NULL) is required.

  Input Parameters:
+ A   - the SEQAIJCUSPARSE matrix (values are read on the device)
. n   - number of entries to copy
- idx - indices (0-based, host memory) into the value array, or NULL for a plain prefix copy

  Output Parameter:
. v   - destination array (host or device memory), filled with the selected values
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: gather into a device scratch buffer first */
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = av[widx[i]] via a zipped permutation gather executed on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: copy the leading n values straight to v */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* fix: when v is host memory the n values above moved device->host, so account
     them as GpuToCpu traffic (previously mislogged as PetscLogCpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
5029b0c00012SPierre Jolivet PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
5030